-
Notifications
You must be signed in to change notification settings - Fork 0
/
LauraVoddenHDI.R
352 lines (249 loc) · 11.9 KB
/
LauraVoddenHDI.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
# Vodden, Laura MA5820
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ASSIGNMENT 3 CAPSTONE PROJECT R CODE~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
###########################################################################################################
# Set libraries
library(tidyr)
library(dplyr)
library(ggplot2)
library(ggpubr)
# Report the version of the environment the analysis was run in.
# RStudio.Version() is only defined inside an RStudio session, so the call is
# guarded; under plain R / Rscript the R version string is printed instead.
if (exists("RStudio.Version")) {
  print(RStudio.Version())
} else {
  print(R.version.string)
}
# Set working directory
# The path is specific to the original author's machine. Switch to it only when
# it exists; otherwise the CSV files are read from the current working directory.
data_dir <- 'C:/Users/laura/OneDrive/Desktop/MA5820/Assignment_3'
if (dir.exists(data_dir)) {
  setwd(data_dir)
} else {
  message("Data directory not found; reading CSV files from ", getwd())
}
# Import the three source tables (HDI, CO2 per person, total CO2).
# read.csv() stops with an error if a file is missing.
HDI_Data <- read.csv("hdi_human_development_index.csv")
CO2_per_person_Data <- read.csv("co2_emissions_tonnes_per_person.csv")
Total_CO2_Data <- read.csv("yearly_co2_emissions_1000_tonnes.csv")
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~DATA CLEANING~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# Reduce each table by dropping columns by position, then rename the X2014 column.
# NOTE(review): the positions presumably leave only the country column and the
# 2014 value in each table -- verify against the layout of the CSV files.

# HDI: drop columns 2-25 and 27; X2014 becomes HDI
HDI_Data <- HDI_Data %>% select(-c(2:25, 27))
names(HDI_Data) <- gsub("X2014", "HDI", names(HDI_Data))
print(HDI_Data)
# Copy of the HDI table with "HDI" replaced by "HDI_num" in the column names,
# taken before the values are converted to categories
HDI_num_Data <- setNames(HDI_Data, gsub("HDI", "HDI_num", names(HDI_Data)))
print(HDI_num_Data)
# CO2 per person: drop columns 2-215; X2014 becomes CO2_pp
CO2_per_person_Data <- CO2_per_person_Data %>% select(-c(2:215))
names(CO2_per_person_Data) <- gsub("X2014", "CO2_pp", names(CO2_per_person_Data))
print(CO2_per_person_Data)
# Total CO2: drop columns 2-264; X2014 becomes Total_CO2
Total_CO2_Data <- Total_CO2_Data %>% select(-c(2:264))
names(Total_CO2_Data) <- gsub("X2014", "Total_CO2", names(Total_CO2_Data))
print(Total_CO2_Data)
# For HDI_Data, convert numerical values to categorical variables
# Low:    HDI <  0.5
# Medium: 0.5 <= HDI < 0.8
# High:   0.8 <= HDI < 1.0
# cut() bins the numeric column in a single pass. Assigning the labels one at a
# time coerces the column to character after the first assignment, so the later
# thresholds would be compared as strings rather than as numbers.
# right = FALSE makes each interval closed on the left, matching the "<" rules
# above. Missing values (and any value >= 1.0) come out as NA.
# as.character() keeps the column a plain character vector; the ordered factor
# levels are applied after the tables are merged.
HDI_Data$HDI <- as.character(
  cut(
    HDI_Data$HDI,
    breaks = c(-Inf, 0.5, 0.8, 1.0),
    labels = c("Low", "Medium", "High"),
    right = FALSE
  )
)
# Combine the three tables: start from the HDI table and attach both CO2 tables
# with left joins. No `by` is given, so dplyr joins on the columns the tables share.
Data_2014 <- HDI_Data %>%
  left_join(CO2_per_person_Data) %>%
  left_join(Total_CO2_Data)
print(Data_2014)
# Drop every row that has a missing value in any column
Data_2014 <- na.omit(Data_2014)
# Derive a population estimate: Total_CO2 (file is in 1000 tonnes) scaled by 1000,
# divided by the per-person emissions
Data_2014$Population <- 1000 * Data_2014$Total_CO2 / Data_2014$CO2_pp
# Fix the level order Low < Medium < High so plots keep the categories in order
Data_2014$HDI <- factor(Data_2014$HDI, levels = c("Low", "Medium", "High"))
# Draw a reproducible random sample of 100 rows (without replacement) that is
# used for all of the objectives below
set.seed(100)
index <- sample.int(nrow(Data_2014), 100)
print(index)
Data_2014 <- Data_2014[index, ]
print(Data_2014)
# Save the sampled data set
write.csv(Data_2014, file = 'Data_2014.csv')
###################################################################################################################
# Objective 1
# Test whether there is a difference between total CO2 emissions in high and low HDI countries
# Remove 'medium' HDI countries from dataset.
# %in% is used so that a missing HDI value is dropped as well; a bare == / !=
# comparison would return NA for it and produce an all-NA row.
High_Low_HDI <- Data_2014[Data_2014$HDI %in% c("Low", "High"), ]
# Exploratory data analysis
# coord_cartesian() zooms the y axis without discarding data. ylim() would remove
# every country above 60000 before the boxplot statistics are computed, which
# distorts the medians, quartiles and whiskers that are drawn.
HDI_Total_CO2_plot <- ggplot(data = High_Low_HDI) +
  geom_boxplot(mapping = aes(x = HDI, y = Total_CO2), outlier.size = 1) +
  coord_cartesian(ylim = c(0, 60000))
HDI_Total_CO2_plot + labs(title = "Total CO2 Emissions by HDI Category")
# State the hypotheses
# The null hypothesis is that the population median for total CO2 emissions for high HDI countries
# is equal to the population median for total CO2 emissions for low HDI countries.
# The alternative hypothesis is that the two medians differ.
# State the significance level
# This is the probability of committing a type I error. We use the common significance level of 5% here.
# Therefore alpha = 0.05.
# Check assumptions (at least 11 cases for both low and high HDI):
# print the number of countries in each of the two remaining categories
table(droplevels(High_Low_HDI$HDI))
# Perform two-sample Wilcoxon test on new dataset
zw <- wilcox.test(Total_CO2 ~ HDI, data = High_Low_HDI, alternative = "two.sided", mu = 0)
zw
# If there are tied ranks, R gives a warning (not an error) noting that
# the exact p-value cannot be computed and an approximation is reported instead.
# Result recorded by the author: p-value = 0.0001043
# At the 5% level this is sufficient evidence that total CO2 emissions differ between
# high and low HDI countries. The test is two-sided, so the direction of the difference
# (higher emissions in high HDI countries) is read from the boxplot, not from the test.
###################################################################################################################
# Objective 2
# Test whether CO2 emissions per person are higher in high HDI countries than in low HDI countries
print(Data_2014)
# Normal Q-Q plot of per-person emissions for the full sample
qqnorm(Data_2014$CO2_pp)
qqline(Data_2014$CO2_pp)
# Remove outliers
# Note: despite its name, sd_Data_2014 is 2.2 times the sample mean, not a
# standard deviation. Rows with CO2_pp at or above that cut-off are discarded.
mean_Data_2014 <- mean(Data_2014$CO2_pp)
sd_Data_2014 <- 2.2 * mean_Data_2014
Data_2014_OM <- subset(Data_2014, CO2_pp < sd_Data_2014)
# Q-Q plot again, now without the discarded rows
qqnorm(Data_2014_OM$CO2_pp)
qqline(Data_2014_OM$CO2_pp)
# Split the outlier-free data into the two groups being compared
Low_HDI <- Data_2014_OM[which(Data_2014_OM$HDI == "Low"), ]
High_HDI <- Data_2014_OM[which(Data_2014_OM$HDI == "High"), ]
# Side-by-side Q-Q plots: Low HDI on the left, High HDI on the right
par(mfrow = c(1, 2))
qqnorm(Low_HDI$CO2_pp)
qqline(Low_HDI$CO2_pp)
qqnorm(High_HDI$CO2_pp)
qqline(High_HDI$CO2_pp)
# State the hypotheses
# The null hypothesis is that there is no difference in CO2 emissions per person between the two
# HDI categories. That is, the mean CO2 emissions per person is the same for both HDI categories.
# The alternative hypothesis is that the mean CO2 emissions per person is higher in the high HDI
# category than in the low HDI category.
# H0: mu_High = mu_Low
# HA: mu_High > mu_Low
# State the significance level
# This is the probability of committing a type I error. We use the common significance level of 5% here.
# Therefore alpha = 0.05.
# The two-sample t-test assumes:
# The data are a simple random sample. We assume that the randomly sampled countries are a
# representative sample of the entire population of countries.
# Observations come from a population that is normally distributed. The Q-Q plots of the sample
# data, shown above, indicate that this is a reasonable assumption.
# Perform a significance test
# Two-sample t-test with pooled variance, computed by hand
# Print the two group standard deviations so they can be compared
print(sd(Low_HDI$CO2_pp))
print(sd(High_HDI$CO2_pp))
# The author judged these close enough to assume equal variances, so the
# standard error below is built from the pooled variance.
# Group summaries: x = High HDI, y = Low HDI
nx <- nrow(High_HDI)
ny <- nrow(Low_HDI)
xbar <- mean(High_HDI$CO2_pp)
ybar <- mean(Low_HDI$CO2_pp)
sx <- sd(High_HDI$CO2_pp)
sy <- sd(Low_HDI$CO2_pp)
degrees.freedom <- nx + ny - 2
# Pooled variance, standard error of the difference in means, and t statistic
spsqr <- ((nx - 1)*sx^2 + (ny - 1)*sy^2) / degrees.freedom
std.err <- sqrt(spsqr * (1/nx + 1/ny))
tstatistic <- (xbar - ybar) / std.err
print(tstatistic)
print(paste("The t-statistic is", signif(tstatistic, 4)))
print(paste("There are", degrees.freedom, "degrees of freedom."))
# Determine the rejection region
# One-sided test at level alpha: the critical value is the (1 - alpha) quantile
# of the t distribution, and H0 is rejected when tstatistic exceeds tstar
alpha <- 0.05
tstar <- qt(1 - alpha, df = degrees.freedom)
print(tstar)
print(paste("tstar is", signif(tstar, 4)))
# Result recorded by the author: the t statistic lies in the rejection region,
# so the null hypothesis is rejected.
# Q-Q plots side by side: full sample (left) and outliers removed (right)
par(mfrow = c(1, 2))
qqnorm(Data_2014$CO2_pp)
qqline(Data_2014$CO2_pp)
qqnorm(Data_2014_OM$CO2_pp)
qqline(Data_2014_OM$CO2_pp)
# Echo the key quantities once more
print(degrees.freedom)
print(tstatistic)
print(tstar)
###################################################################################################################
# Objective 3
# Regression analysis
# Test whether Population can be used to predict Total_CO2.
# A linear regression is fitted further down with Total_CO2 as the response
# variable and Population as the predictor variable.
# Exploratory data analysis
# Normality

# Draw a 2 x 2 panel for a data frame `d`: boxplots of Population and Total_CO2
# on the top row, and their normal Q-Q plots (with reference lines) below.
show_distributions <- function(d) {
  par(mfrow = c(2, 2))
  boxplot(d$Population, main = "Population", ylab = "Count")
  boxplot(d$Total_CO2, main = "Total CO2 (1000 metric t)", ylab = "Count")
  qqnorm(d$Population)
  qqline(d$Population)
  qqnorm(d$Total_CO2)
  qqline(d$Total_CO2)
}

# Full sample
show_distributions(Data_2014)
# Remove outliers: keep only rows with a Population below 10 million
Data_2014_OM3 <- subset(Data_2014, Population < 1e7)
# Same panel after the outliers are removed
show_distributions(Data_2014_OM3)
# Author's reading of the plots: both Total_CO2 and Population are approximately
# normally distributed once the outliers are removed.
# Scatterplot of the relationship between Population and Total_CO2. Total_CO2 is
# the response, so it goes on the vertical axis.
par(mfrow = c(1, 1))
with(
  Data_2014_OM3,
  plot(Population, Total_CO2, main = "Population vs Total_CO2", xlab = "Population", ylab = "Total_CO2")
)
# Outliers have already been removed from the dataset
# Estimate the parameters using inbuilt R functions:
# simple linear regression of Total_CO2 (response) on Population (predictor)
zmodel <- lm(Total_CO2 ~ Population, data = Data_2014_OM3)
summary(zmodel)
# Compute the fitted values of the statistical model
Total_CO2.fitted <- predict(zmodel)
# Add one line to the left margin (presumably to make room for the y-axis label).
# NOTE(review): this is added to the current margin every time the line is run
# and is never reset.
par(mar = par()$mar + c(0, 1, 0, 0))
plot(Total_CO2.fitted, Data_2014_OM3$Total_CO2, main = "Total_CO2 fitted values",
     xlab = "Fitted values of Total_CO2", ylab = "Actual values of Total_CO2")
# Dashed reference line y = x
abline(0, 1, lty = 2)
# If the fitted values were identical to the actual values, all the points would fall
# on the line. Author's reading: this is not quite the case, but there is some
# correspondence between the fitted values and the observed values of Total_CO2.
# Compute the residuals of the linear model
residuals <- resid(zmodel)
# After forming a linear model there are several aspects we should verify:
# Check that the assumptions of the residuals are satisfied.
# Check the R2 to determine if the explanatory power of the model is useful
# (we do not want to use a model to make predictions if the R2 is too small).
# Check that the model passes important significance tests about the population
# slope and population R2.
# Test the normal distribution of the residuals:
qqnorm(residuals)
qqline(residuals)
# Author's reading: the Q-Q plot shows the residuals are approximately normal.
# To test for constant variance we create a scatterplot of the residuals against
# the fitted values. Ideally you should not see a pattern and the points should be
# randomly scattered around the horizontal axis.
par(mfrow = c(1, 2))
# The residuals are in the units of Total_CO2 (1000 tonnes), not years.
plot(Total_CO2.fitted, residuals, main = "Constant variance", xlab = "Fitted Total_CO2",
     ylab = "Residual (1000 tonnes)")
abline(h = 0, lty = 2)
# Author's reading: the scatterplot shows no discernible pattern to the residuals,
# so it is reasonable to assume the residuals meet the assumption of constant variance.
# Finally, we check the independence of the residuals by plotting them in the sequence
# in which they appeared in the data. We do not want this plot to show any trending or
# cyclic features.
plot(residuals, type = 'b', main = "Independence", ylab = "Residual")
abline(h = 0, lty = 2, col = 'grey')
# Author's reading: there is no discernible pattern in the residuals, which leads us to
# believe that the assumption of independence for the residuals is valid.
# Compute the R squared value
# Test the significance
print(summary(zmodel))
# Results recorded by the author:
# The R2 value indicates that 43.6% of the variability in total CO2 emissions is
# explained by the linear model.
# The p-value is very small, giving us confidence that these data come from a
# population whose R2 is different to zero. Equivalently, we can also be confident
# that these data come from a population where there is a trend between Population
# and Total_CO2 -- although, with this R2, not a very strong one.
# Show the line of best fit and confidence and prediction bands for the model
# Evaluate the model on an evenly spaced grid of 101 Population values.
# NOTE(review): the grid runs to 80,000,000 although the fitted data only contains
# populations below 10,000,000, so most of the grid lies outside the data range.
xgrid <- seq(from = 0, to = 80000000, length.out = 101)
xgridframe <- data.frame(Population = xgrid)
# Each prediction matrix has the columns fit, lwr and upr
pc <- predict(zmodel, newdata = xgridframe, interval = "confidence")
pp <- predict(zmodel, newdata = xgridframe, interval = "prediction")
fit <- pc[, "fit"]
par(mfrow = c(1, 1))
with(
  Data_2014_OM3,
  plot(Population, Total_CO2, main = "Population as a predictor of total CO2 emissions", xlab = "Population", ylab = "Total_CO2")
)
# Solid line: fitted values; dashed: confidence band; dotted: prediction band
lines(xgrid, fit)
matlines(xgrid, pc[, c("lwr", "upr")], col = 1, lty = 2)
matlines(xgrid, pp[, c("lwr", "upr")], col = 1, lty = 3)
legend(50000, 70000, legend = c("Line of best fit", "Confidence interval", "Prediction interval"), lty = c(1, 2, 3))