Skip to content

Commit

Permalink
Merge pull request #64 from openpharma/dev
Browse files Browse the repository at this point in the history
#62 and snowflake compat…
  • Loading branch information
erblast authored Sep 25, 2024
2 parents acac522 + ec5c545 commit 5bdd121
Show file tree
Hide file tree
Showing 11 changed files with 50 additions and 201 deletions.
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ export("%>%")
export(aggr_duplicated_visits)
export(check_df_visit)
export(eval_sites)
export(eval_sites_deprecated)
export(exp_implicit_missing_visits)
export(get_config)
export(get_ecd_values)
Expand Down
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
- https://github.com/openpharma/simaerep/issues/59
- https://github.com/openpharma/simaerep/issues/16
- update vignettes
- https://github.com/openpharma/simaerep/issues/62

# simaerep 0.5.0
- allow flexible AE rates in data simulations
Expand Down
6 changes: 6 additions & 0 deletions R/S3_simaerep.R
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ validate_simaerep <- function(x) {
#' Default: TRUE.
#'@param inframe Logical, only table operations to be used; does not require
#' visit_med75. Compatible with dbplyr supported database backends.
#'@param mult_corr Logical, multiplicity correction, Default: TRUE
#'@param param_site_aggr List of parameters passed to [site_aggr()]. Default:
#' list(method = "med75_adj", min_pat_pool = 0.2).
#'@param param_sim_sites List of parameters passed to [sim_sites()]. Default:
Expand Down Expand Up @@ -133,6 +134,7 @@ simaerep <- function(df_visit,
visit_med75 = TRUE,
inframe = FALSE,
progress = TRUE,
mult_corr = TRUE,
param_site_aggr = list(
method = "med75_adj",
min_pat_pool = 0.2
Expand Down Expand Up @@ -168,6 +170,10 @@ simaerep <- function(df_visit,
param_sim_sites$under_only <- under_only
param_eval_sites$under_only <- under_only

if (! mult_corr) {
param_eval_sites$method <- NA
}

if (visit_med75 && ! inframe) {

if (r != param_sim_sites$r) {
Expand Down
18 changes: 17 additions & 1 deletion R/inframe.R
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,23 @@ sim_inframe <- function(df_visit, r = 1000, df_site = NULL) {
mutate(
# we generate a random number between 1 and the maximum number
# of eligible patients
rnd = runif(n()),
rnd = runif(n())
)

if (inherits(df_sim_prep, "tbl_Snowflake")) {
# snowflake RANDOM() works differently than other backends and returns large
# positive and negative integers. We normalize by using the min and max values
# of those 64bit integers. Source for integer values is ChatGPT.
# for a range from -5 to 5: 4 -> 0.0 -4 -> 0.1, with underlying formula

df_sim_prep <- df_sim_prep %>%
mutate(
rnd = (.data$rnd - (-9223372036854775808)) / (9223372036854775807 - (-9223372036854775808))
)
}

df_sim_prep <- df_sim_prep %>%
mutate(
# transform to values between 1 and max number of patients
rnd_rwn = floor(.data$rnd * .data$max_rwn) + 1
)
Expand Down
119 changes: 4 additions & 115 deletions R/simaerep.R
Original file line number Diff line number Diff line change
Expand Up @@ -380,9 +380,6 @@ get_visit_med75 <- function(df_pat,
#' df_eval <- eval_sites(df_sim_sites)
#' df_eval
#'
#' # use deprecated method -------
#' df_eval <- eval_sites(df_sim_sites, method = NULL, r_sim_sites = 100)
#' df_eval
#' @rdname eval_sites
#' @seealso \code{\link[simaerep]{site_aggr}},
#' \code{\link[simaerep]{sim_sites}},
Expand All @@ -393,13 +390,6 @@ eval_sites <- function(df_sim_sites,
under_only = TRUE,
...) {


if (is.null(method)) {
warning("using deprecated method for probability adjustment")
df_out <- eval_sites_deprecated(df_sim_sites, ...)
return(df_out)
}

df_out <- df_sim_sites

if ("pval" %in% colnames(df_out)) {
Expand Down Expand Up @@ -458,6 +448,10 @@ eval_sites <- function(df_sim_sites,
#'@keywords internal
p_adjust <- function(df, col, suffix, method = "BH") {

if (is.na(method) || is.null(method) || method %in% c("None", "none")) {
return(df)
}

if (inherits(df, "data.frame")) {

col_adj <- paste0(col, "_adj")
Expand Down Expand Up @@ -515,111 +509,6 @@ get_any_na <- function(df, col) {
})
}

#' @title Evaluate sites.
#' @description Correct under-reporting probabilities by the expected number of
#' false positives (fp). This has been deprecated in favor of more conventional
#' methods available via \code{\link[stats]{p.adjust}}.
#' @param df_sim_sites dataframe generated by [sim_sites()][sim_sites()]
#' @param r_sim_sites integer, number of repeats for bootstrap resampling for site
#' simulation, needed for zero probability correction for fp calculation, Default: 1000
#' @return dataframe with the following columns:
#' \describe{
#' \item{**study_id**}{study identification}
#' \item{**site_number**}{site identification}
#' \item{**visit_med75**}{median(max(visit)) * 0.75}
#' \item{**mean_ae_site_med75**}{mean AE at visit_med75 site level}
#' \item{**mean_ae_study_med75**}{mean AE at visit_med75 study level}
#' \item{**pval**}{p-value as returned by `poisson.test`}
#' \item{**prob_low**}{bootstrapped probability for having mean_ae_site_med75 or lower}
#' \item{**n_site**}{number of study sites}
#' \item{**pval_n_detected**}{sites with the same p-value or lower}
#' \item{**pval_fp**}{expected number of fp, pval * n_site}
#' \item{**pval_p_vs_fp_ratio**}{odds under-reporting as p/fp, poisson.test (use as benchmark)}
#' \item{**pval_prob_ur**}{probability under-reporting as 1 - fp/p, poisson.test (use as benchmark)}
#' \item{**prob_low_n_detected**}{sites with same bootstrapped probability or lower}
#' \item{**prob_low_fp**}{expected number of fp, prob_lower * n_site}
#' \item{**prob_low_p_vs_fp_ratio**}{odds under-reporting as p/fp, bootstrapped (use)}
#' \item{**prob_low_prob_ur**}{probability under-reporting as 1 - fp/p, bootstrapped (use)}
#' }
#' @details If by chance expected number of
#' false positives (fp) is greater than the total number of positives (p) we
#' set p_vs_fp_ratio = 1 and prob_ur = 0.
#' @examples
#' df_visit <- sim_test_data_study(n_pat = 100, n_sites = 5,
#' frac_site_with_ur = 0.4, ur_rate = 0.6)
#'
#' df_visit$study_id <- "A"
#' df_site <- site_aggr(df_visit)
#'
#' df_sim_sites <- sim_sites(df_site, df_visit, r = 100)
#'
#' df_eval <- eval_sites_deprecated(df_sim_sites, r_sim_sites = 100)
#' df_eval
#' @rdname eval_sites_deprecated
#' @seealso [site_aggr()][site_aggr()], [sim_sites()][sim_sites()]
#' @export
eval_sites_deprecated <- function(df_sim_sites,
r_sim_sites) {


df_out <- df_sim_sites

if ("pval" %in% names(df_out)) {

if (anyNA(df_out$pval)) {
warning("pval column contains NA")
}

df_out <- df_out %>%
group_by(.data$study_id) %>%
arrange(.data$study_id, .data$pval) %>%
mutate(
n_site = n(),
min_pval = min(.data$pval[.data$pval > 0]),
# we need to use ties.method max because there can be more than one site with a
# specific p value
pval_n_detected = rank(.data$pval, ties.method = "max", na.last = "keep"),
pval_fp = .data$pval * .data$n_site,
pval_fp = ifelse(.data$pval_fp == 0, .data$min_pval / 10, .data$pval_fp),
pval_p_vs_fp_ratio = .data$pval_n_detected / .data$pval_fp,
pval_p_vs_fp_ratio = ifelse(is.na(.data$pval_p_vs_fp_ratio) & ! is.na(.data$pval),
0, .data$pval_p_vs_fp_ratio),
# it is possible to get ratios lower than one if p < expected fp
# this can happen by chance and is meaningless
pval_p_vs_fp_ratio = ifelse(.data$pval_p_vs_fp_ratio < 1, 1, .data$pval_p_vs_fp_ratio),
pval_prob_ur = 1 - 1 / .data$pval_p_vs_fp_ratio
) %>%
select(- "min_pval")
}

if ("prob_low" %in% names(df_out)) {

if (anyNA(df_out$pval)) {
warning("prob_lower column contains NA")
}

df_out <- df_out %>%
group_by(.data$study_id) %>%
arrange(.data$study_id, .data$prob_low) %>%
mutate(
n_site = n(),
# we need to use ties.method max because there can be more than one site with a
# specific prob_low value
prob_low_n_detected = rank(.data$prob_low, ties.method = "max", na.last = "keep"),
prob_low_fp = .data$prob_low * .data$n_site,
prob_low_fp = ifelse(.data$prob_low_fp == 0, 1 / r_sim_sites / 10, .data$prob_low_fp),
prob_low_p_vs_fp_ratio = .data$prob_low_n_detected / .data$prob_low_fp,
prob_low_p_vs_fp_ratio = ifelse(is.na(.data$prob_low_p_vs_fp_ratio) & ! is.na(.data$prob_low),
0, .data$prob_low_p_vs_fp_ratio),
# it is possible to get ratios lower than one if p < expected fp
# this can happen by chance and is meaningless
prob_low_p_vs_fp_ratio = ifelse(.data$prob_low_p_vs_fp_ratio < 1, 1, .data$prob_low_p_vs_fp_ratio),
prob_low_prob_ur = 1 - 1 / .data$prob_low_p_vs_fp_ratio
)
}

return(ungroup(df_out))
}



Expand Down
3 changes: 0 additions & 3 deletions man/eval_sites.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

60 changes: 0 additions & 60 deletions man/eval_sites_deprecated.Rd

This file was deleted.

3 changes: 3 additions & 0 deletions man/simaerep.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions tests/testthat/test_S3_simaerep.R
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,21 @@ test_that("plot.simaerep throws error when original visit data cannot be retriev
"ggplot"
)
})

test_that("simaerep() with mult_corr = FALSE must not return adjusted probabilities", {

aerep <- simaerep(df_visit_test, mult_corr = FALSE)
expect_true(! "prob_low_prob_ur" %in% colnames(aerep$df_eval))

aerep <- simaerep(df_visit_test, mult_corr = FALSE, under_only = FALSE)
expect_true(! "prob_low_prob_ur" %in% colnames(aerep$df_eval))
expect_true(! "prob_high_prob_or" %in% colnames(aerep$df_eval))

aerep <- simaerep(df_visit_test, mult_corr = FALSE, inframe = TRUE)
expect_true(! "prob_low_prob_ur" %in% colnames(aerep$df_eval))

aerep <- simaerep(df_visit_test, mult_corr = FALSE, under_only = FALSE, inframe = TRUE)
expect_true(! "prob_low_prob_ur" %in% colnames(aerep$df_eval))
expect_true(! "prob_high_prob_or" %in% colnames(aerep$df_eval))

})
20 changes: 0 additions & 20 deletions tests/testthat/test_eval_sites.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,23 +52,3 @@ test_that("eval_sites() - check column names of returned data frame", {
all()
)
})

test_that(
paste(
"eval_sites() with method = NULL uses deprecated method for p value correction.",
"Must throw warning.",
"Stats from sim_sites() must negatively correlate with p/fp ratio.",
"p/fp ratio must not be smaller 1."
), {

expect_warning({
df_eval <- eval_sites(df_sim_sites_test, method = NULL, r_sim_sites = 100)
})

expect_true(cor(df_eval$prob_low, df_eval$prob_low_p_vs_fp_ratio) < 0)
expect_true(cor(df_eval$pval, df_eval$pval_p_vs_fp_ratio) < 0)

expect_true(! any(df_eval$prob_low_p_vs_fp_ratio < 1))
expect_true(! any(df_eval$pval_p_vs_fp_ratio < 1))

})
2 changes: 1 addition & 1 deletion vignettes/performance.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,7 @@ in past experiments.
- multiplicity correction imposes a penalty on the true positive rate

> This observation was already made by the Boeringer Ingelheim Team during the evaluation of {simaerep}. We can
now reporducibly confirm this. The unaltered probability score as returned by the bootstrap algorithm already provides
now reproducibly confirm this. The unaltered probability score as returned by the bootstrap algorithm already provides
very realistic under-reporting probabilities.

- {simaerep} outperforms simpler methods such as funnel plot and box plot outlier detection.
Expand Down

0 comments on commit 5bdd121

Please sign in to comment.