-
Notifications
You must be signed in to change notification settings - Fork 2
Making Formatted Tables
In this tutorial we cover the creation of formatted tables containing one row per scenarios and one formatted column per outcome of interest.
The outcomes are reported with their median over all replications and the 95% simulation intervals.
scenario_name | outcome1 | outcome2 | ... |
---|---|---|---|
sc_1 | 10 (4, 17) | 73 (50, 85) | ... |
... | ... | ... | ... |
sc_14 | 1024 (800, 1402) | 39 (20, 60) | ... |
For this tutorial, we assume that the scenarios have already been processed into
a tibble
containing one row per simulation and one column per outcome of
interest. See the Processing Scenarios Tibbles wiki page.
The functions required for this processing are provided at the end as they don't need to be altered. We will cover how to use them first and their code later on.
First we source
the code in R/utils-format.R
and load the CSV file containing
the raw results.
source("R/utils-format.R")
d_sc_raw <- readr::read_csv("sc_raw.csv")
glimpse(d_sc_raw)
#>
#> Rows: 1,994
#> Columns: 15
#> $ scenario_name <chr> "0_no_doxy", "0_no_doxy", "0_no_doxy", "0_no_doxy", "0_
#> $ batch_number <chr> "1", "1", "1", "1", "1", "1", "1", "1", "2", "2", "2",
#> $ sim <int> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3
#> $ ir100.B_ly <dbl> 1.661293, 1.698790, 1.752273, 1.801898, 1.673978, 1.768
#> $ ir100.H_ly <dbl> 0.4823359, 0.4383548, 0.6653676, 0.3444499, 0.5919497,
#> $ ir100.W_ly <dbl> 0.2822352, 0.3250677, 0.1847902, 0.2113912, 0.2861212,
#> $ prepElig.B_ly <dbl> 23381.56, 23510.79, 22996.17, 22989.04, 23430.38, 23426
#> $ prepElig.H_ly <dbl> 3075.115, 3142.442, 3073.231, 3009.269, 3056.654, 3013.
#> $ prepElig.W_ly <dbl> 30986.40, 30876.50, 30807.92, 30764.06, 31042.63, 30835
#> $ i.prev.B_ly <dbl> 0.2952991, 0.2944744, 0.3025195, 0.2960906, 0.2947840,
#> $ i.prev.H_ly <dbl> 0.09360180, 0.10228400, 0.10794107, 0.10434721, 0.10131
#> $ i.prev.W_ly <dbl> 0.05904783, 0.06204328, 0.06055708, 0.06148030, 0.06145
#> $ incid.B_cml <dbl> 696, 705, 726, 747, 694, 746, 698, 682, 714, 685, 652,
#> $ incid.H_cml <dbl> 24, 22, 33, 17, 30, 31, 18, 27, 31, 20, 33, 26, 24, 23,
#> $ incid.W_cml <dbl> 143, 165, 92, 108, 144, 128, 131, 140, 127, 130, 135, 1
The outcome variables are usually compact R
syntactic names. For our table
we will want cleaner and more explicit names.
For this we define a var_labels
variable. A named character
vector where the
names correspond to the column names, and the values are the string to replace
them with.
var_labels <- c(
"i.prev.B_ly" = "HIV Prevalence Black (last year)",
"i.prev.H_ly" = "HIV Prevalence Hispanic (last year)",
"i.prev.W_ly" = "HIV Prevalence White (last year)",
"ir100.B_ly" = "HIV Incidence per 100 PYAR Black (last year)",
"ir100.H_ly" = "HIV Incidence per 100 PYAR Hispanic (last year)",
"ir100.W_ly" = "HIV Incidence per 100 PYAR White (last year)",
"prepElig.B_ly" = "Mean Number of PrEP Eligibles Black (last year)",
"prepElig.H_ly" = "Mean Number of PrEP Eligibles Hispanic (last year)",
"prepElig.W_ly" = "Mean Number of PrEP Eligibles White (last year)",
"incid.B_cml" = "HIV Cumulative Incidence Black (10y)",
"incid.H_cml" = "HIV Cumulative Incidence Hispanic (10y)",
"incid.W_cml" = "HIV Cumulative Incidence White (10y)"
)
Next, we want to print the number in a cleaner fashion. For this we use the
scales package . It provides a set of label_
functions creating number formatters.
In this example, we use scales::label_number(0.01)
for small number where we
want 2 decimal digits, scales::label_percent(0.1)
for percents that are to be
formatted like 43.2%
and scales::label_number(1)
for number without decimal
digits.
The format_patterns
list, contains one sublist per formatter we want to use.
The patterns
element is a character
vector of patterns to match against.
The fun
element is the formatter of interest.
format_patterns <- list(
small_num = list(
patterns = c("^ir100"),
fun = scales::label_number(0.01)
),
perc = list(
patterns = c("^i.prev"),
fun = scales::label_percent(0.1)
),
# formatter with a catch all pattern. Must be last.
default = list(
patterns = ".*",
fun = scales::label_number(1)
)
)
Above, the small_fun
sublist will match all outcome variable starting with
ir100
and use the scales::label_number(0.01)
formatter.
Multiple patterns could be use per formatter. The matching is done with
str_detect
.
The last element, default
uses a catch all pattern ".*"
. So by default, all
outcomes will use this formatter. In this case, the scales::label_number(1)
.
We are basically done. The format_table
function provided takes the raw
tibble
, the labels and the format pattern as input and does all the formatting.
d_table <- format_table(d_sc_raw, var_labels, format_patterns)
readr::write_csv(d_table, "table.csv")
d_table
#> # A tibble: 19 × 13
#> scenario_name HIV Prevalence Black (last ye…¹ HIV Prevalence Hispa…²
#> <chr> <chr> <chr>
#> 1 0_no_doxy 29.5% (28.7%, 30.2%) 10.3% (9.4%, 10.9%)
#> 2 t2_3_prep_poz_1_placebo 29.5% (28.7%, 30.5%) 10.2% (9.3%, 11.2%)
#> 3 t2_4_1sti_12m_1.2 30.5% (30.2%, 30.8%) 10.6% (10.6%, 10.7%)
#> 4 t2_5_2sti_12m_1 29.4% (29.0%, 30.0%) 10.1% (9.3%, 10.9%)
#> 5 t2_5_2sti_12m_1_placebo 29.7% (28.8%, 30.6%) 10.2% (9.5%, 11.0%)
#> 6 t3_0_dumb_base 29.4% (28.5%, 30.1%) 10.0% (9.3%, 10.8%)
#> 7 t3_0_dumb_base_nsc 29.3% (28.7%, 30.0%) 10.0% (9.3%, 10.8%)
#> 8 t3_1_dumb_var_gc_0.2 29.4% (28.4%, 30.1%) 10.0% (9.2%, 10.9%)
#> 9 t3_1_dumb_var_gc_5 29.4% (28.7%, 30.2%) 10.0% (9.2%, 10.9%)
#> 10 t3_1_dumb_var_gc_nsc_0.2 29.4% (28.8%, 30.1%) 10.1% (9.1%, 11.1%)
#> 11 t3_1_dumb_var_gc_nsc_5 29.5% (28.8%, 30.1%) 10.1% (9.3%, 11.0%)
#> 12 t3_2_dumb_var_ct_0.2 29.3% (28.6%, 30.2%) 10.1% (9.3%, 11.0%)
#> 13 t3_2_dumb_var_ct_5 29.4% (28.6%, 30.1%) 10.0% (9.1%, 10.9%)
#> 14 t3_2_dumb_var_ct_nsc_0.2 29.4% (28.7%, 30.0%) 10.0% (9.2%, 10.9%)
#> 15 t3_2_dumb_var_ct_nsc_5 29.5% (28.7%, 30.1%) 10.2% (9.3%, 10.8%)
#> 16 t3_3_dumb_var_sti_0.2 29.4% (28.7%, 30.0%) 10.0% (9.2%, 11.0%)
#> 17 t3_3_dumb_var_sti_5 29.5% (28.7%, 30.1%) 10.0% (9.3%, 10.9%)
#> 18 t3_3_dumb_var_sti_nsc_0.2 29.4% (28.7%, 30.1%) 10.1% (9.2%, 11.0%)
#> 19 t3_3_dumb_var_sti_nsc_5 29.5% (28.6%, 30.2%) 10.1% (9.4%, 10.9%)
The raw CSV and the formatted table are both available here
The format_table
function will only keep the outcomes which are present in
var_labels
. Therefore, you can split your tables by splitting var_labels
into
sub pieces and calling format_table
multiple times
The format_table
function reorder the column to match the order in
var_labels
. This allows a better parameterization of the outputed table.
This function gives you an exploitable CSV that is useful to assess results but not publication ready.
A good way to bridge the gap is to make a nice table skeleton in Excel or Google Sheets and to paste the values cells from the CSV. This way, when the simulations are run again, we can simply paste the formatted values.
This section simply contains the code for format_table
and other helper
functions.
You can simply paste it in a R/utils-format.R
script to have it available.
Feel free to inspect the code if you need to iterate on it for some specific purposes.
### utils-format.R
library(dplyr)
library(tidyr)
format_table <- function(d, var_labels, format_patterns) {
formatters <- make_formatters(var_labels, format_patterns)
d_out <- d |>
sum_quants(0.025, 0.5, 0.975) |>
pivot_longer(-scenario_name) |>
separate(name, into = c("name", "quantile"), sep = "_/_") |>
pivot_wider(names_from = quantile, values_from = value) |>
filter(name %in% names(var_labels)) |>
mutate(
clean_val = purrr::pmap_chr(
list(name, l, m, h),
~ common_format(formatters, ..1, ..2, ..3, ..4))
) |>
select(-c(l, m, h)) |>
mutate(
name = var_labels[name]
) |>
pivot_wider(names_from = name, values_from = clean_val) |>
arrange(scenario_name)
reorder_cols(d_out, var_labels)
}
make_formatters <- function(var_labels, format_patterns) {
fmts <- vector(mode = "list", length = length(var_labels))
for (nms in names(var_labels)) {
for (fp in format_patterns) {
if (any(stringr::str_detect(nms, fp$patterns))) {
fmts[[nms]] <- fp$fun
break()
}
}
}
fmts
}
sum_quants <- function(d, ql = 0.025, qm = 0.5, qh = 0.975) {
d |>
ungroup() |>
select(-c(batch_number, sim)) |>
group_by(scenario_name) |>
summarise(across(
everything(),
list(
l = ~ quantile(.x, ql, na.rm = TRUE),
m = ~ quantile(.x, qm, na.rm = TRUE),
h = ~ quantile(.x, qh, na.rm = TRUE)
),
.names = "{.col}_/_{.fn}"
),
.groups = "drop"
)
}
reorder_cols <- function(d, var_labels) {
missing_cols <- setdiff(names(d), var_labels)
cols_order <- c(missing_cols, intersect(var_labels, names(d)))
d[, cols_order]
}
common_format <- function(formatters, name, ql, qm, qh) {
if (is.na(qm)) {
"-"
} else {
paste0(
formatters[[name]](qm), " (", formatters[[name]](ql),
", ", formatters[[name]](qh), ")"
)
}
}