-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathAbdel_Tarricone_R_Script.R
239 lines (186 loc) · 8.75 KB
/
Abdel_Tarricone_R_Script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
##We start by loading some useful libraries that we will need during this process
library(readr) #Used to read the csv
library(tidyverse) #Contains a lot of useful packages to clean the datas
library(gganimate) #To create animated plots
library(ggthemes) #To select some nice themes
library(zoo) #To fill the N.A. values
library(grid) #To display plots into a grid
library(gridExtra) #To display plots into a grid
library(ggpubr)#To display plots into a grid
library(olsrr)#For model analysis
##We now load the row csv files what we need to merge
Cancer <- read_csv("Datasets/Cancer.csv")
Air_pollution <- read_csv("Datasets/Air pollution.csv")
Alcool <- read_csv("Datasets/Alcool.csv")
GDP_per_capta <- read_csv("Datasets/GDP per capta.csv")
Obesity <- read_csv("Datasets/Obesity.csv")
Old_age_dependency_ratio <- read_csv("Datasets/Old age dependency ratio.csv")
Smoking <- read_csv("Datasets/Smoking.csv")
Population <- read_csv("Datasets/population-since-1800.csv")
countryContinent <- read_csv("Datasets/countryContinent.csv")
#We proceed by taking the explanatory variable data set (Cancer) and adding the covariates as columns other columns, to do that we use the use the libraby "dplyr" as it makes it very easy to do and readable
Continent = countryContinent %>% select(country, continent)
Continentt = rename(Continent, Entity = country)
Cancer1 = left_join(Cancer, Air_pollution, by = c("Entity", "Code", "Year"))
Cancer2 = left_join(Cancer1, Alcool, by = c("Entity", "Code", "Year"))
Cancer3 = left_join(Cancer2, GDP_per_capta, by = c("Entity", "Code", "Year"))
Cancer4 = left_join(Cancer3, Obesity, by = c("Entity", "Code", "Year"))
Cancer5 = left_join(Cancer4, Old_age_dependency_ratio, by = c("Entity", "Code", "Year"))
Cancer6 = left_join(Cancer5, Smoking, by = c("Entity", "Code", "Year"))
Cancer7 = left_join(Cancer6, Continentt, by = "Entity")
Cancer_final = Cancer7
rm("Cancer1", "Cancer2", "Cancer3", "Cancer4", "Cancer5", "Cancer6", "Cancer7")
rm("Air_pollution", "Alcool", "Cancer", "GDP_per_capta", "Obesity", "Old_age_dependency_ratio", "Smoking", "Continent", "Continentt", "countryContinent")
#We now proceed by renaming the columns. This will make all the manipulations of the dataframe more readable:
colnames(Cancer_final)[4] <- "Cancer"
colnames(Cancer_final)[5] <- "AirPoll"
colnames(Cancer_final)[6] <- "Alcool"
colnames(Cancer_final)[7] <- "GDP"
colnames(Cancer_final)[8] <- "Obesity"
colnames(Cancer_final)[9] <- "Age"
colnames(Cancer_final)[10] <- "Smoking"
Cancer_final
write.csv(Cancer_final, "Cancer_final.csv", row.names=FALSE, quote=FALSE) #to save the file into our environment
#We read the file needed
share_of_population_with_cancer <- read_csv("Cancer.csv") #read the file
share_of_population_with_cancer = data.frame(lapply(share_of_population_with_cancer, function(x) {
gsub("United States", "USA", x) #correction needed for the mapping
}))
colnames(share_of_population_with_cancer)[1] <- "region" #Give the same name for the country in the two datasets
colnames(share_of_population_with_cancer)[4] <- "cancer" #Dropping the complicated name given by the data frame
# Create the actual map that we will use
#We will use the ggplo2 package for that
mapdata = map_data("world") #Used to take the latitude and longitude of the countries
mapp <- mapdata %>%
left_join(share_of_population_with_cancer, by = "region") #merge the two datasets
mapp2 = mapp %>%
filter(!is.na(mapp$cancer)) #dealing with NA
mapp2$cancer <- as.numeric(mapp2$cancer) #changing the value to numeric for plotting
mapp2$Year <- as.numeric(mapp2$Year) #changing the value to numeric for animation
## BUILDING THE MAP: Warning: This could take a very long time : (
#Here we create a static map
ggplot( data = mapp2, aes(long, lat, group = subregion)) +
geom_map(
aes(map_id = region),
map = mapdata,
color = "black", fill = "gray30", size = 0.3
) +
geom_polygon(aes(group = group, fill = cancer), color = "black") +
scale_fill_gradient2(low = "blue", high = "red" , midpoint = 1.0355583) +
theme_minimal() +
labs(
title = "Share of population with cancer in 2017",
x = "", y = "", fill = ""
) +
theme_bw()
ggsave("Share of people with cancer in 2017.jpg")
##End of World map plotting
#Some data cleaning
Cancergraph = left_join(Cancer_final, Population, by = c("Entity", "Code", "Year"))
Cancergraph$Smoking = na.locf(Cancergraph$Smoking)
Cancergraph$AirPoll = na.locf(Cancergraph$AirPoll)
Cancergraph$Alcool = na.locf(Cancergraph$Alcool)
Cancergraph$GDP = na.locf(Cancergraph$GDP, fromLast = TRUE)
Cancergraph$Age = na.locf(Cancergraph$Age)
Cancergraph$Obesity = na.locf(Cancergraph$Obesity)
Cancergraph$`Population (historical estimates)` = na.locf(Cancergraph$`Population (historical estimates)`)
#Here we plots
Cancergraph = subset(Cancergraph, Year == "2017")
graphPoll = Cancergraph %>%
ggplot(aes(x = AirPoll, y = Cancer, color = continent, size = `Population (historical estimates)`)) +
geom_point(alpha = 0.9, stroke = 0) +
scale_size(range = c(2,12), guide = "none") +
scale_color_brewer(palette = "Set2") +
labs(title = "Cancer incidence vs Air Pollution",
x = "Air pollution exposure ",
y = "Share of people with Cancer",
color = "Continent",
caption = "Source: Our World in Data") +
labs (subtitle = "Year: 2017}")
graphAlco = Cancergraph %>%
ggplot(aes(x = Alcool, y = Cancer, color = continent, size = `Population (historical estimates)`)) +
geom_point(alpha = 0.9, stroke = 0) +
scale_size(range = c(2,12), guide = "none") +
scale_color_brewer(palette = "Set2") +
labs(title = "Cancer incidence vs Alcool consumption",
x = "Alcool consumed",
y = "Share of people with Cancer",
color = "Continent",
caption = "Source: Our World in Data") +
labs (subtitle = "Year: 2017")
graphGDP = Cancergraph %>%
ggplot(aes(x = GDP, y = Cancer, color = continent, size = `Population (historical estimates)`)) +
geom_point(alpha = 0.9, stroke = 0) +
scale_size(range = c(2,12), guide = "none") +
scale_color_brewer(palette = "Set2") +
labs(title = "Cancer incidence vs GDP",
x = "GDP per capta",
y = "Share of people with Cancer",
color = "Continent",
caption = "Source: Our World in Data") +
labs (subtitle = "Year: 2017")
graphObes = Cancergraph %>%
ggplot(aes(x = Obesity, y = Cancer, color = continent, size = `Population (historical estimates)`)) +
geom_point(alpha = 0.9, stroke = 0) +
scale_size(range = c(2,12), guide = "none") +
scale_color_brewer(palette = "Set2") +
labs(title = "Cancer incidence vs Obesity",
x = "Share of Obese people",
y = "Share of people with Cancer",
color = "Continent",
caption = "Source: Our World in Data") +
labs (subtitle = "Year: 2017")
graphAge = Cancergraph %>%
ggplot(aes(x = Age, y = Cancer, color = continent, size = `Population (historical estimates)`)) +
geom_point(alpha = 0.9, stroke = 0) +
scale_size(range = c(2,12), guide = "none") +
scale_color_brewer(palette = "Set2") +
labs(title = "Cancer incidence vs Age",
x = "Old people ratio",
y = "Share of people with Cancer",
color = "Continent",
caption = "Source: Our World in Data") +
labs (subtitle = "Year: 2017")
graphSmoke = Cancergraph %>%
ggplot(aes(x = Smoking, y = Cancer, color = continent, size = `Population (historical estimates)`)) +
geom_point(alpha = 0.9, stroke = 0) +
scale_size(range = c(2,12), guide = "none") +
scale_color_brewer(palette = "Set2") +
labs(title = "Cancer incidence vs Smoking",
x = "Number of Smokers",
y = "Share of people with Cancer",
color = "Continent",
caption = "Source: Our World in Data") +
labs (subtitle = "Year: 2017")
pplot = ggarrange(graphAge, graphAlco, graphGDP, graphObes, graphPoll, graphSmoke + rremove("x.text"), ncol = 2, nrow = 3)
# Delete the N.A.s
mydata = read.csv("Cancer_final.csv")
data5 = na.omit(mydata)
# Delete USA and Canada from the data set
data3 = subset(data5, Entity != "USA")
#Normality of the residuals
model = lm( Cancer~ Smoking + Obesity + GDP + Age + AirPoll + Alcool, data = data3)
#Analyzing the regression
summary(model)
#Perform Testing for normality of residuals
ols_test_normality(model)
#plot histogram of residuals
kop = residuals.lm(model)
hist(kop, breaks = 50, main = "Histogram of the residuals", xlab = "Residual")
#Check graphically using the function plot
plot(model)
#Model selection: Stepdown
frwfit <- ols_step_forward_p(model, pent = 0.05)
frwfit
#Model selection Stepdown
bkwfit <- ols_step_backward_p(model, prem = 0.05)
bkwfit
#Model Sekection Stepboth
bothfit <- ols_step_both_p(model, pent = 0.05, prem = 0.05)
bothfit
#Model Selection All subset models
allsub2 <- ols_step_best_subset(model)
allsub2
#Model Selection: Best models
allsub2 <- ols_step_best_subset(model)
allsub2