-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathfiles.R
171 lines (129 loc) · 5.28 KB
/
files.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
library(tidyverse)
library(magrittr)
library(broom)
# want to show filter, arrange, group_by, summarise, pivoting, select
# if you feel comfortable using the pipe, feel free to do so
library(mosaicData)
data(Gestation)
# how many observations in the data frame?
count(Gestation)
# how many observations for each race?
count(Gestation, race)
# how many for each race and educational attainment?
count(Gestation, race, ed)
# that's a bit messy, so let's widen it into a table
pivot_wider(count(Gestation, race, ed),
names_from = race,
values_from = n)
# or for pipe users
Gestation %>%
count(race, ed) %>%
pivot_wider(names_from = race,
values_from = n)
# those NA values are really 0s because there are zero cases recorded
pivot_wider(count(Gestation, race, ed),
names_from = race,
values_from = n,
values_fill = 0)
# what's the mean age in the data set?
summarise(Gestation, age = mean(age, na.rm = T))
# what's the mean age for each race group?
summarise(group_by(Gestation, race),
mean = mean(age, na.rm = T))
# what's the mean and SD of age for each race group?
summarise(group_by(Gestation, race),
mean = mean(age, na.rm = T),
sd = sd(age, na.rm = T))
# or, alternatively
summarise_at(group_by(Gestation, race),
.vars = vars(age),
.funs = list(Mean = mean, SD = sd),
na.rm = T)
# let's reorder in order of frequency
Gestation <- mutate(Gestation,
race2 = factor(race),
race2 = fct_explicit_na(f = race2,
na_level = "unknown"))
count(Gestation, race2)
# to make even more friendly, copy and paste the following
Gestation <- mutate(Gestation,
race2 = stringr::str_to_title(race2),
race2 = forcats::fct_infreq(race2),
race2 = forcats::fct_relevel(race2, "Unknown", after = Inf))
# we would probably recognise Mexican as Hispanic these days in US data sources
Gestation <- mutate(Gestation,
race2 = forcats::fct_recode(race2, "Hispanic" = "Mex"))
count(Gestation, race2)
Gestation_age_race_unknown <- lm(data = Gestation, age ~ race2)
tidy(Gestation_age_race_unknown)
glance(Gestation_age_race_unknown)
Gestation_age_race <- lm(data = Gestation, age ~ race)
tidy(Gestation_age_race)
glance(Gestation_age_race)
# our conclusion changes based on whether or not we include the 'unknown' group
# an alternative way to fit this would be to filter out all `Unknown` so that we still have human-friendly labels
Gestation %>%
filter(race2 != "Unknown") %>%
lm(data = ., formula = age ~ race) %>%
glance
## we can use summary statistics that require more than one variable so long as they return a scalar, e.g. the sample correlation
Gestation %>%
group_by(race2) %>%
summarise(`r(wt, age)` = cor(wt, age, use = "pairwise.complete.obs"))
## What if we want to look at more than one correlation
Gestation %>%
group_by(race2) %>%
summarise(r = cor(wt, age, gestation, use = "pairwise.complete.obs")) # will fail
## we need to get a
Gestation %>%
select(wt, age, gestation) %>%
cor(use = "pairwise") %>%
as.data.frame %>%
tibble::rownames_to_column(var = "row") %>%
pivot_longer(-row, names_to = "col")
# fancy version!!
Gestation %>%
select(race2, wt, age, gestation) %>%
nest(data = -race2) %>%
mutate(C = map(data, cor, use = "pairwise")) %>%
mutate(C_ = map(C, upper.tri)) %>%
mutate(C_long = map(C, function(x){data.frame(x) %>%
tibble::rownames_to_column("row") %>%
pivot_longer(-row, "col")})) %>%
unnest(C_long) %>%
select(-data, -C) %>%
filter(!(row == col))
# do a model for birth weight as a function of smoking
Gestation %<>%
filter(!is.na(race)) %>%
replace_na(list(number = "non-zero")) %>%
mutate(number = ifelse(number == "never", "0 per day", number)) %>%
mutate(number_ = stringr::str_extract(number, pattern = "^[0-9]{1,2}"),
number_ = parse_number(number_)) %>%
mutate(number = fct_reorder(number, number_, min))
Gestation %>%
glm(data = ., wt ~ number, family = Gamma) %>%
tidy(conf.int = T) %>%
mutate(term = sub(pattern = "^number", replacement = "", x = term)) %>%
mutate(term = sub("(Intercept)", "Never", term, fixed = T))
Gestation %>%
lm(data = ., wt ~ number) %>%
{bind_cols(distinct(Gestation, number),
predict(object = .,
newdata = distinct(Gestation, number),
interval = "conf") %>% data.frame)} %>%
ggplot(data = ., aes(x = number, y = fit)) +
geom_pointrange(aes(ymin = lwr,
ymax = upr))
###
Gestation %>%
mutate(smoke = str_to_sentence(smoke)) %>%
mutate(smoke = fct_explicit_na(f = smoke, "Smoking unknown")) %>%
mutate(smoke = factor(smoke,
levels = c("Never", "Once did, not now",
"Until current pregnancy",
"Now",
"Smoking unknown"))) %>%
group_by(race2, `Smoking status` = smoke) %>%
summarise(Mean = mean(wt, na.rm = T)) %>%
pivot_wider(names_from = "race2", values_from = "Mean")