Skip to content

Commit

Permalink
outline for over-reporting score
Browse files Browse the repository at this point in the history
  • Loading branch information
erblast committed Feb 27, 2024
1 parent 2f4f43a commit ae94573
Show file tree
Hide file tree
Showing 6 changed files with 262 additions and 16 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: simaerep
Title: Find Clinical Trial Sites Under-Reporting Adverse Events
Version: 0.4.4
Version: 0.4.3.900
Authors@R: c(
person(given = "Bjoern",
family = "Koneswarakantha",
Expand Down
3 changes: 2 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# simaerep 0.4.4
# simaerep 0.5.0
- allow flexible AE rates in data simulations
- add vignette comparing simaerep to gsm performance
- add over-reporting probability
- fix dplyr warnings
- fix warnings around ggplot and cowplot

Expand Down
12 changes: 12 additions & 0 deletions R/simaerep.R
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,7 @@ get_visit_med75 <- function(df_pat,
#' @export
eval_sites <- function(df_sim_sites,
method = "BH",
under_only = TRUE,
...) {


Expand Down Expand Up @@ -406,6 +407,17 @@ eval_sites <- function(df_sim_sites,
prob_low_adj = p.adjust(.data$prob_low, method = method),
prob_low_prob_ur = 1 - .data$prob_low_adj
)

if (! under_only) {
df_out <- df_out %>%
mutate(prob_high = 1 - prob_low) %>%
arrange(.data$study_id, .data$prob_high) %>%
mutate(
prob_high_adj = p.adjust(.data$prob_high, method = method),
prob_high_prob_or = 1 - .data$prob_high_adj
)
}

}

return(ungroup(df_out))
Expand Down
26 changes: 14 additions & 12 deletions R/simaerep_plot.R
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,9 @@ plot_study <- function(df_visit,
study,
df_al = NULL,
n_sites = 16,
pval = FALSE) {
pval = FALSE,
prob_col = "prob_low_prob_ur") {

# TODO: parametrize scores, fix legend

df_visit <- check_df_visit(df_visit)
Expand Down Expand Up @@ -496,18 +498,18 @@ plot_study <- function(df_visit,
# ordered sites -------------------------------------------------------------

n_site_ur_gr_0p5 <- df_eval %>%
filter(.data$prob_low_prob_ur > 0.5) %>%
filter(.data[[prob_col]] > 0.5) %>%
nrow()

if (n_site_ur_gr_0p5 > 0) {
sites_ordered <- df_eval %>%
arrange(.data$study_id, desc(.data$prob_low_prob_ur)) %>%
filter(.data$prob_low_prob_ur > 0.5) %>%
arrange(.data$study_id, desc(.data[[prob_col]])) %>%
filter(.data[[prob_col]] > 0.5) %>%
head(n_sites) %>%
.$site_number
} else {
sites_ordered <- df_eval %>%
arrange(.data$study_id, desc(.data$prob_low_prob_ur)) %>%
arrange(.data$study_id, desc(.data[[prob_col]])) %>%
head(6) %>%
.$site_number
}
Expand Down Expand Up @@ -564,16 +566,16 @@ plot_study <- function(df_visit,
# define score cut-offs + labels----------------------------------------------

palette <- RColorBrewer::brewer.pal(9, "Blues")[c(3, 5, 7, 9)]
breaks <- c(0, 0.5, 0.75, 0.95, ifelse(max(df_eval$prob_low_prob_ur, na.rm = TRUE) > 0.95,
max(df_eval$prob_low_prob_ur, na.rm = TRUE) + 0.1,
breaks <- c(0, 0.5, 0.75, 0.95, ifelse(max(df_eval[[prob_col]], na.rm = TRUE) > 0.95,
max(df_eval[[prob_col]], na.rm = TRUE) + 0.1,
NA)
)

breaks <- breaks[! is.na(breaks)]

df_eval <- df_eval %>%
mutate(
prob_cut = cut(.data$prob_low_prob_ur, breaks = breaks, include.lowest = TRUE),
prob_cut = cut(.data[[prob_col]], breaks = breaks, include.lowest = TRUE),
color_prob_cut = palette[as.numeric(.data$prob_cut)]
)

Expand Down Expand Up @@ -617,19 +619,19 @@ plot_study <- function(df_visit,
, by = c("study_id", "site_number")
) %>%
left_join(df_alert, by = c("study_id", "site_number")) %>%
select(c(
select(any_of(c(
"study_id",
"site_number",
"visit",
"mean_ae",
"n_pat",
"n_pat_with_med75",
"prob_low_prob_ur",
prob_col,
"prob_cut",
"color_prob_cut",
"pval_prob_ur",
"alert_level_site"
)) %>%
))) %>%
mutate(
site_number = fct_relevel(.data$site_number, sites_ordered),
color_alert_level = case_when(
Expand Down Expand Up @@ -739,7 +741,7 @@ plot_study <- function(df_visit,
x = 0.2 * max_visit,
y = 0.9 * max_ae,
na.rm = TRUE) +
geom_label(aes(label = paste(round(prob_low_prob_ur, 3) * 100, "%"),
geom_label(aes(label = paste(round(.data[[prob_col]], 3) * 100, "%"),
color = color_prob_cut),
data = df_label,
x = 0.8 * max_visit,
Expand Down
4 changes: 2 additions & 2 deletions man/simaerep.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

231 changes: 231 additions & 0 deletions vignettes/over.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
---
title: "Over-Reporting Probability"
output:
html_document:
toc: true
toc_depth: 3
toc_float: true
number_sections: true
code_folding: show
collapse: false
editor_options:
chunk_output_type: console
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, cache = FALSE, fig.width = 10)
```

# Load
```{r}
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(simaerep))
suppressPackageStartupMessages(library(ggExtra))
```

# Introduction

{siamerep} was originally created to detect under-reporting of AEs and therefore no over-reporting probability was calculated. Nevertheless {simaerep} can theoretically be used to simulate all kinds of subject-based clinical events, for some such as issues over-reporting can represent a quality issue. With the recent release `0.5.0` we have added the option to calculate an over-reporting probability score.

# Data Set

We simulate a standard data set with a high number of sites, patients, visits and events to ensure that most of our dimensions will be normally distributed. We do not add any over- or under-reporting sites at this point.

```{r}
df_visit <- sim_test_data_study(
n_pat = 10000,
n_sites = 1000,
frac_site_with_ur = 0,
max_visit_mean = 100,
max_visit_sd = 1,
ae_per_visit_mean = 5
)
df_visit$study_id <- "A"
```

# Run {simaerep}

in order to add the over-reporting probability score we need to set the parameter `under_only = FALSE`.

```{r}
system.time(
aerep_def <- simaerep(
df_visit,
param_sim_sites = list(under_only = TRUE),
param_eval_sites = list(under_only = TRUE)
)
)
```

The original setting skips the simulation for all sites that do have more AEs than the study average.

```{r}
system.time(
aerep_ovr <- simaerep(
df_visit,
param_sim_sites = list(under_only = FALSE),
param_eval_sites = list(under_only = FALSE)
)
)
```

The new parameter calculates the probability of a site getting `a lower or equal` average AE count for the site visit_med75 for every site, regardless of how its initial value compares to the study average. The calculation only takes a few seconds longer than the default setting.

In the evaluation data frame we have three more columns available now.

```{r}
setdiff(colnames(aerep_ovr$df_eval), colnames(aerep_def$df_eval))
```

# Analyze

## Probability getting a lower or equal AE count


```{r}
cols <- c("study_id", "site_number", "mean_ae_site_med75", "mean_ae_study_med75", "prob_low")
p <- bind_rows(
select(
aerep_ovr$df_eval,
all_of(cols)
) %>%
mutate(type = "with over-reporting"),
select(
aerep_def$df_eval,
all_of(cols)
) %>%
mutate(type = "default")
) %>%
ggplot(aes(x = mean_ae_site_med75 - mean_ae_study_med75, y = prob_low, color = type)) +
geom_point(alpha = 0.5) +
theme(legend.position = "bottom") +
scale_color_manual(values = c("gold", "blue")) +
labs(y = "Probability of getting a lower or equal mean site AE in a 1000x simulation")
ggExtra::ggMarginal(p, groupColour = TRUE, type = "density")
```


We can see that we have a gap for the default setting in the generated probabilities. The values filling the gap can be interpreted as the probability of having a `higher` site average than originally observed.

We need to be aware that values in the simulation `equal` to the site mean will count towards the under-reporting probability. This can potentially lead to artifacts with low AE counts.

## Over-Reporting

We can add the over-reporting probability as (1- under-reporting probability)

```{r}
cols <- c("study_id", "site_number", "mean_ae_site_med75", "mean_ae_study_med75")
p <- bind_rows(
select(
aerep_ovr$df_eval,
all_of(cols),
value = "prob_low"
) %>%
mutate(type = "new under-reporting"),
select(
aerep_ovr$df_eval,
all_of(cols),
value = "prob_high"
) %>%
mutate(type = "new over-reporting"),
select(
aerep_def$df_eval,
all_of(cols),
value = "prob_low"
) %>%
mutate(type = "default under-reporting")
) %>%
ggplot(aes(x = mean_ae_site_med75 - mean_ae_study_med75, y = value, color = type)) +
geom_point(alpha = 0.25) +
theme(legend.position = "bottom") +
scale_color_manual(values = c("gold", "purple", "blue")) +
labs(y = "Probability of getting a lower or equal mean site AE in a 1000x simulation")
ggExtra::ggMarginal(p, groupColour = TRUE, type = "density")
```

## Multiplicity Correction

The multiplicity correction dampens the signal, avoiding false positives that are the result of chance.

```{r}
cols <- c("study_id", "site_number", "mean_ae_site_med75", "mean_ae_study_med75")
p <- bind_rows(
select(
aerep_ovr$df_eval,
all_of(cols),
value = "prob_low_prob_ur"
) %>%
mutate(type = "new under-reporting"),
select(
aerep_ovr$df_eval,
all_of(cols),
value = "prob_high_prob_or"
) %>%
mutate(type = "new over-reporting"),
select(
aerep_def$df_eval,
all_of(cols),
value = "prob_low_prob_ur"
) %>%
mutate(type = "default under-reporting")
) %>%
ggplot(aes(x = mean_ae_site_med75 - mean_ae_study_med75, y = value, color = type)) +
geom_point(alpha = 0.25) +
theme(legend.position = "bottom") +
scale_color_manual(values = c("gold", "purple", "blue")) +
labs(y = "Probability of getting a lower or equal mean site AE in a 1000x simulation")
ggExtra::ggMarginal(p, groupColour = TRUE, type = "density")
```

# Simulating Over-Reporting

We can simulate under-reporting by supplying a negative ratio for `ur_rate`

```{r}
df_visit <- sim_test_data_study(
frac_site_with_ur = 0.05,
ur_rate = - 0.5,
)
df_visit$study_id <- "A"
distinct(df_visit, site_number, is_ur, ae_per_visit_mean)
```


```{r}
aerep <- simaerep(
df_visit,
param_sim_sites = list(under_only = FALSE),
param_eval_sites = list(under_only = FALSE)
)
```

```{r}
aerep$df_eval %>%
select(site_number, mean_ae_site_med75, mean_ae_study_med75, prob_low_prob_ur, prob_high_prob_or)
```

We can plot over-reporting by changing setting `prob_col = "prob_high_prob_or"`.

```{r}
plot(aerep, prob_col = "prob_high_prob_or")
plot(aerep, prob_col = "prob_low_prob_ur")
```

0 comments on commit ae94573

Please sign in to comment.