-
Notifications
You must be signed in to change notification settings - Fork 49
/
Training-Example1.R
186 lines (120 loc) · 5.46 KB
/
Training-Example1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
hospitals <- read.table("https://bit.ly/2qZwdMn", header =T)
# remove the first column
hospitals <- hospitals[,-1]
# Create factors variables
hospitals$Region <- as.factor(hospitals$Region)
hospitals$Med.school <- as.factor(hospitals$Med.school)
attach(hospitals)
# remove categorical variables for the purposes of graphing
hospitals_num <- hospitals[,c(-7,-8)]
pairs(hospitals_num)
library(psych)
# create a scatter plot matrix
pairs.panels(hospitals_num)
# Better maybe to use
panel.cor <- function(x, y, digits=2, prefix="", cex.cor)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y))
txt <- format(c(r, 0.123456789), digits=digits)[1]
txt <- paste(prefix, txt, sep="")
if(missing(cex.cor))
cex <- 0.8/strwidth(txt)
test <- cor.test(x,y)
# borrowed from printCoefmat
Signif <- symnum(test$p.value, corr = FALSE, na = FALSE,
cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
symbols = c("***", "**", "*", ".", " "))
text(0.5, 0.5, txt, cex = cex * r)
text(.8, .8, Signif, cex=cex, col=2)
}
pairs(hospitals_num, upper.panel = panel.cor)
lm <- lm(Infection.risk ~ hospitals_num$Culture, data = hospitals_num)
summary(lm)
# How well your model fits the data?
# Which other variables would you use to predicit infection risk using Multiple Linear Regression?
lm1 <- lm(Infection.risk ~ . , data = hospitals)
summary(lm1)
# Facilities, Culture, Length.of.stay
# Select significant variables and build your multiple
lm2 <- lm(Infection.risk ~ hospitals_num$Length.of.stay + hospitals_num$Culture + hospitals_num$Facilities+hospitals$X.ray, data = hospitals)
summary(lm2)
# Q6 Are the infection risks in us Regions different? Write your Hypothesis and provide significant tests.
#
# H_0 is that mean of all group are the same
# H_1 at least one mean that is different
m<-aov(Infection.risk~Region, data=hospitals)
summary(m)
# Df Sum Sq Mean Sq F value Pr(>F)
# Region 3 14.0 4.666 2.714 0.0484 *
# Residuals 109 187.4 1.719
# We see that region factor variable is signinficant so that at least two of the regions are different in
# terms of infection ratio.
# If we want to know which exact regions are different we can do the pairwise comparision.
# And if we want to be more correct we can adjust the p-value.
pairwise.t.test(Infection.risk, Region, p.adjust.method = "none")
pairwise.t.test(Infection.risk, Region, p.adjust.method = "bonferroni")
# data: Infection.risk and Region
# 1 2 3
# 2 1.000 - -
# 3 0.032 0.859 -
# 4 1.000 1.000 1.000
# P value adjustment method: bonferroni
#
#
# We can see that region (1 and 3) are different after bonferroni adjustment.
TukeyHSD(m, conf.level = 0.90)
# Fit: aov(formula = Infection.risk ~ Region, data = hospitals)
# $Region
# diff lwr upr p adj
# 2-1 -0.4669643 -1.2537701 0.3198415 0.5168684
# 3-1 -0.9336873 -1.6952799 -0.1720946 0.0269952
# 4-1 -0.4794643 -1.4323334 0.4734048 0.6489580
# 3-2 -0.4667230 -1.2007205 0.2672746 0.4563333
# 4-2 -0.0125000 -0.9434612 0.9184612 0.9999891
# 4-3 0.4542230 -0.4555290 1.3639749 0.6545991
#
# Also in Tuky procedure we can see that two regions 3-1 are different.
# Q7: Is number of Nurses in each hospital a significan covariate? Are the differences in different
# region driven by the number of Nureses?
# To answer the above question we need to do ANCOVA and consider the number of nurses as a covariate.
# We check if the number of nurses is a covariate or not.
library(car)
# library(lsmeans) # is depricated
library(estimability)
library(emmeans)
Anova(lm(Infection.risk~Region+Nurses, data=hospitals), type=3)
# > Anova(lm(Infection.risk~Region+Nurses, data=hospitals), type=3)
# Anova Table (Type III tests)
# Response: Infection.risk
# Sum Sq Df F value Pr(>F)
# (Intercept) 329.78 1 223.9144 < 2.2e-16 ***
# Region 11.06 3 2.5028 0.06314 .
# Nurses 28.32 1 19.2287 2.703e-05 ***
# Residuals 159.06 108
# We see that region is no more significant and number of nurses is significant.
# This means that the differences that we see in infection ratios is due to the differences in number of nurses
# and not due to differences in region.
# After adjusting for Nurses we see the Region is not siginificat here
# And Number of nurses is drving here
# We can adjust for number of nurses.
emm_options(contrasts=c("contr.treatment", "contr.poly"))
my.model<-lm(Infection.risk~Region+Nurses, data=hospitals)
lsmeans(my.model, specs = "Region", contr = "pairwise" )
# Q8: Considering the case that Hospitals are affiliated medical school and they are in different reigons,
# are all of these hospitals different in terms of infection risks?
# Med school (Yes or not) is categorical variable.
# We need here to do two-way anova with two categorical variables (region and MedSchool) and one continous variable infection risks.
typeof(Med.school)
# "integer"
# We need to convert it to factor
factorMed.School <- ifelse(Med.school==1, TRUE, FALSE)
typeof(factorMed.School)
model<-lm(Infection.risk~Region + factorMed.School + Region * factorMed.School, data=hospitals)
summary(model)
# We can see from the summary that none of the combinations are siginificant
interaction.plot(Region , Med.school, Infection.risk, col=1:2)
# We can see that the there is some interaction between some of the regions and med school
# Two-way Anova
Anova(model, type=3)