-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path03_summarization.R
55 lines (45 loc) · 1.65 KB
/
03_summarization.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# aggregating file from student-level to school-level
library(tidyverse)
# Load the student-level records that are aggregated to school level below.
y8s <- read_rds(file = "processed_data/year8_students.rds")
# Helper functions for the summarization step.
# Named list of per-column summary statistics; the names are used as the
# suffix of the generated column names. Every function takes one vector `x`
# and returns a single value.
summary_fns <- list(
  # Mean of the non-missing values (NaN when there are none). Warnings,
  # e.g. those raised for non-numeric input, are deliberately silenced.
  mean = function(x) suppressWarnings(mean(x, na.rm = TRUE)),
  # Number of non-missing values.
  valid = function(x) sum(!is.na(x)),
  # Number of missing values.
  missing = function(x) sum(is.na(x)),
  # Standard error of the mean: sd / sqrt(number of non-missing values).
  se = function(x) {
    n_valid <- sum(!is.na(x))
    # A standard deviation needs at least two observations. Return a *double*
    # NA so the result type does not depend on the group size.
    if (n_valid < 2) {
      return(NA_real_)
    }
    sd(x, na.rm = TRUE) / sqrt(n_valid)
  }
)
# Return the most frequent non-missing value of `x` as a character string.
# Ties go to the value that comes first in the frequency table.
# Returns NA_character_ when `x` is empty or contains only missing values;
# without this guard a factor with no observations would report its first
# (zero-count) level as the "most common" value.
get_most_common <- function(x) {
  observed <- x[!is.na(x)]
  if (length(observed) == 0) {
    return(NA_character_)
  }
  counts <- table(observed)
  # which.max() picks the first maximum, so ties keep table order.
  names(counts)[[which.max(counts)]]
}
# Return the first element of `x`, skipping missing values.
get_first <- function(x) {
  dplyr::first(x, na_rm = TRUE)
}
# Collapse the student-level rows into one row per school_id and peiljaar.
# NOTE: this step takes a while to run.
school_data <-
  y8s |>
  # school_id combines the two WPOBRIN columns into a single key
  mutate(school_id = paste(WPOBRIN_crypt, WPOBRINVEST, sep = "_")) |>
  summarise(
    # person characteristics: apply every function in summary_fns
    across(
      .cols = c(
        starts_with("outcome"),
        starts_with("control"),
        -starts_with("control_school"),
        -control_testtype
      ),
      .fns = summary_fns,
      .names = "{.col}_{.fn}"
    ),
    # school characteristics: keep the value returned by get_first
    across(
      .cols = c(starts_with("control_school"), starts_with("intervention")),
      .fns = get_first
    ),
    # test type: keep the value returned by get_most_common
    control_testtype_mostcommon = get_most_common(control_testtype),
    # number of student rows in the group
    control_schoolsize = n(),
    .by = c(school_id, peiljaar)
  ) |>
  mutate(
    across(
      .cols = c(control_schooldenom, control_testtype_mostcommon),
      .fns = as.factor
    )
  )
# Keep the identifier and intervention columns first, followed by all
# control and outcome columns, then write the result to disk.
school_data <- select(
  school_data,
  school_id, peiljaar, intervention, intervention_hours,
  starts_with("control"), starts_with("outcome")
)
write_rds(school_data, "processed_data/school_data.rds")