forked from krutalp/Final_Project_Covid_Data
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Project_Code.rmd
235 lines (139 loc) · 7.62 KB
/
Project_Code.rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
---
title: "Project_Code"
author: "Team 2"
date: "1/13/2023"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
# import libraries
library(ggplot2)
library(dplyr)
# Load the state-level case counts from disk and preview the first rows
case_count <- read.csv(file = "us-states.csv")
head(x = case_count)
```
```{r}
# Keep only the Alabama rows, then add up their `cases` values.
# NOTE(review): a later chunk uses max(cases) per state as the state total,
# which suggests `cases` is a running total; confirm that summing it here
# gives a meaningful figure.
alb <- case_count %>%
  dplyr::filter(state == "Alabama")
sum(alb[["cases"]])
```
```{r}
# Parse the `date` column into Date values
case_count <- dplyr::mutate(case_count, date = as.Date(date))
# One row per state holding the largest `cases` value recorded for it
state_count <- do.call(
  data.frame,
  aggregate(cases ~ state, data = case_count, FUN = max)
)
head(state_count)
```
Exploratory Data Analysis on US-States Dataset
```{r, fig.height= 3 , fig.width= 5}
# Bar chart of the per-state case totals stored in `state_count`.
# To write the most recent plot to disk, call for example
#   ggsave("plot.png", width = 5, height = 5)
# which saves into the working directory and picks the file type from the
# file extension.
ggplot(data = state_count) +
  geom_bar(
    mapping = aes(x = state, y = cases),
    stat = "identity",
    fill = "red",
    alpha = 0.7
  ) +
  ggtitle("Total Cases per US State/Territory") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    plot.title = element_text(hjust = 0.5)
  )
```
```{r}
library(lubridate)
# Sanity check: show the class of the value ymd() returns for a date string
class(lubridate::ymd("2021-04-19"))
```
```{r}
# Plot the reported case counts over time for one state or territory.
#
# Args:
#   region: Value of the `state` column in the global `case_count` data frame
#     whose rows should be plotted (e.g. "New York").
#
# Returns:
#   A ggplot object: red points of `cases` against `date`, a blue vertical
#   line at 2021-04-19, and a rotated blue text label next to that line.
#
# Raises:
#   An error when `case_count` contains no rows for `region`.
state_case_totals <- function(region) {
  filtered_data <- dplyr::filter(case_count, state == region)
  if (nrow(filtered_data) == 0) {
    stop("No rows found in case_count for state: ", region, call. = FALSE)
  }

  label <- paste("Total Number of Covid Cases over Time in", region)
  # Vertical position of the text label (max minus median of the cases)
  label_y <- max(filtered_data$cases) - median(filtered_data$cases)

  ggplot(data = filtered_data, aes(x = date, y = cases)) +
    geom_point(color = "red") +
    ggtitle(label) +
    xlab("Date") +
    ylab("Number of Cases") +
    scale_x_date(breaks = scales::pretty_breaks(n = 20)) +
    theme(
      axis.text.x = element_text(angle = 90, vjust = 0.5),
      plot.title = element_text(hjust = 0.5)
    ) +
    # annotate() draws the label once; a geom_text() layer with constant
    # aesthetics would redraw it for every row of the data
    annotate(
      "text",
      x = as.Date("2021-04-01"),
      y = label_y,
      label = "Covid Vaccines Available",
      colour = "blue",
      angle = 90
    ) +
    # Only xintercept is supplied: any extra positional argument would be
    # taken as `mapping` and ignored with a warning
    geom_vline(xintercept = as.Date("2021-04-19"), lwd = 0.5, colour = "blue")
}
```
Pass any state name to `state_case_totals()` to visualize the total (cumulative) number of cases in that state over time.
```{r}
# Example: plot the case counts over time for New York. Pass any other value
# of the `state` column to plot a different state or territory.
state_case_totals("New York")
```
Exploratory Data Analysis on Vaccine Datasets
1. Data cleaning: read the CSV files, handle null values, and fix data types.
```{r}
# Load the allocation tables, one CSV per vaccine manufacturer
jansen_vaccines <- read.csv(
  file = "COVID-19_Vaccine_Distribution_Allocations_by_Jurisdiction_-_Janssen.csv"
)
pfizer_vaccines <- read.csv(
  file = "COVID-19_Vaccine_Distribution_Allocations_by_Jurisdiction_-_Pfizer.csv"
)
moderna_vaccines <- read.csv(
  file = "COVID-19_Vaccine_Distribution_Allocations_by_Jurisdiction_-_Moderna.csv"
)

# Give the dose columns manufacturer-specific names so they stay distinct
# once the tables are joined. Each rename is a no-op when the source column
# is not present.

# Janssen: first dose only
colnames(jansen_vaccines)[colnames(jansen_vaccines) == "X1st.Dose.Allocations"] <-
  "Jansen_1st_Dose_Counts"

# Pfizer: first and second dose
colnames(pfizer_vaccines)[colnames(pfizer_vaccines) == "X1st.Dose.Allocations"] <-
  "Pfizer_1st_Dose_Counts"
colnames(pfizer_vaccines)[colnames(pfizer_vaccines) == "X2nd.Dose.Allocations"] <-
  "Pfizer_2nd_Dose_Counts"

# Moderna: first and second dose
colnames(moderna_vaccines)[colnames(moderna_vaccines) == "X1st.Dose.Allocations"] <-
  "Moderna_1st_Dose_Counts"
colnames(moderna_vaccines)[colnames(moderna_vaccines) == "X2nd.Dose.Allocations"] <-
  "Moderna_2nd_Dose_Counts"
```
Visualize the total number of vaccine doses allocated over time for a particular state
```{r}
# Build `new_Df`: one table of allocations per jurisdiction and week,
# combining the Pfizer, Moderna and Janssen tables.
join_keys <- c("Week.of.Allocations", "Jurisdiction")
dose_cols <- c(
  "Pfizer_2nd_Dose_Counts",
  "Moderna_2nd_Dose_Counts",
  "Jansen_1st_Dose_Counts"
)

merged_state_df1 <- full_join(pfizer_vaccines, moderna_vaccines, by = join_keys)
# Drop the Pfizer and Moderna 1st-dose columns by name rather than by
# position, so a change in column order cannot remove the wrong columns
merged_state_df1 <- merged_state_df1 %>%
  dplyr::select(-all_of(c("Pfizer_1st_Dose_Counts", "Moderna_1st_Dose_Counts")))
merged_state_df2 <- full_join(merged_state_df1, jansen_vaccines, by = join_keys)

# Convert the dose columns to numeric first and only then replace missing
# values with 0. This also covers NAs introduced by the conversion itself,
# and leaves the key columns (week, jurisdiction) untouched.
new_Df <- merged_state_df2 %>%
  mutate(
    across(all_of(dose_cols), ~ coalesce(as.numeric(.x), 0)),
    Week.of.Allocations = as.Date(Week.of.Allocations, format = "%m/%d/%Y")
  ) %>%
  arrange(Week.of.Allocations)

# View() opens a data viewer, which is only meaningful in an interactive
# session; skip it when the document is rendered non-interactively
if (interactive()) {
  View(new_Df)
}
```
```{r}
# Plot the cumulative vaccine allocations over time for one jurisdiction.
#
# Args:
#   region: Value of the `Jurisdiction` column in the global `new_Df` data
#     frame whose rows should be plotted (e.g. "New Jersey").
#
# Returns:
#   A ggplot object: points joined by a blue line showing the running total
#   of Pfizer 2nd-dose, Moderna 2nd-dose and Janssen 1st-dose counts against
#   `Week.of.Allocations`.
#
# Raises:
#   An error when `new_Df` contains no rows for `region`.
vaccinations_by_state <- function(region) {
  df_filtered <- dplyr::filter(new_Df, Jurisdiction == region)
  if (nrow(df_filtered) == 0) {
    stop("No rows found in new_Df for jurisdiction: ", region, call. = FALSE)
  }

  label <- paste("Vaccination Allocations over Time in", region)

  df_filtered <- df_filtered %>%
    # Sort here so the running total does not depend on the row order of
    # the global table
    arrange(Week.of.Allocations) %>%
    mutate(
      # Weekly total across the three dose columns
      vaccine_sum = Pfizer_2nd_Dose_Counts +
        Moderna_2nd_Dose_Counts +
        Jansen_1st_Dose_Counts,
      # Running total over the weeks
      cum_doses = cumsum(vaccine_sum)
    )

  ggplot(data = df_filtered, aes(x = Week.of.Allocations, y = cum_doses)) +
    geom_point() +
    geom_line(color = "blue") +
    ggtitle(label) +
    xlab("Week of Allocations") +
    ylab("Number of Total Doses") +
    scale_x_date(date_labels = "%Y %b") +
    theme(
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
      plot.title = element_text(hjust = 0.5)
    )
}
```
```{r,fig.height= 3 , fig.width= 5}
# Example: plot the running total of vaccine allocations for New Jersey
vaccinations_by_state("New Jersey")
```
Create a bar graph showing the total vaccinations for each state across all weeks
```{r, fig.height= 3 , fig.width= 5}
# Per-row total across the three dose columns of `new_Df`
state_totals <- dplyr::mutate(
  new_Df,
  vaccine_sum = Pfizer_2nd_Dose_Counts +
    Moderna_2nd_Dose_Counts +
    Jansen_1st_Dose_Counts
)
# Collapse to one row per jurisdiction by summing `vaccine_sum` over all rows
state_totals <- do.call(
  data.frame,
  aggregate(vaccine_sum ~ Jurisdiction, data = state_totals, FUN = sum)
)
# Bar chart of the per-jurisdiction totals
ggplot(data = state_totals) +
  geom_bar(
    mapping = aes(x = Jurisdiction, y = vaccine_sum),
    stat = "identity",
    fill = "blue",
    alpha = 0.7
  ) +
  ggtitle("Total Vaccines Administered per US State/Territory") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    plot.title = element_text(hjust = 0.5)
  )
```
```{r, fig.height= 3 , fig.width= 5}
# US map shaded by the per-state case totals in `state_count`
library(usmap)
plot_usmap(data = state_count, values = "cases", color = "black") +
  scale_fill_continuous(
    low = "white",
    high = "red",
    name = "cases",
    # Spelled out in full; `label` only worked through partial matching
    labels = scales::comma
  ) +
  ggtitle("Total US Covid Cases") +
  theme(
    legend.position = "right",
    plot.title = element_text(hjust = 0.5)
  )
```
```{r, fig.height= 3 , fig.width= 5}
# US map shaded by the per-jurisdiction vaccine totals in `state_totals`.
# The data passed to plot_usmap() here uses a `state` column, so rename
# `Jurisdiction` by name (not by position); re-running this is a no-op.
names(state_totals)[names(state_totals) == "Jurisdiction"] <- "state"
plot_usmap(data = state_totals, values = "vaccine_sum", color = "black") +
  scale_fill_continuous(
    low = "gray",
    high = "blue",
    name = "Vaccines",
    # Spelled out in full; `label` only worked through partial matching
    labels = scales::comma
  ) +
  ggtitle("Total US Covid Vaccinations") +
  theme(
    legend.position = "right",
    plot.title = element_text(hjust = 0.5)
  )
```
Analysis Comments
1. There is a strong relationship between a state's total Covid case count and the total vaccine allocation it received: states with more cases received more vaccines.
```{r}
# Join the per-state case totals with the per-state vaccine totals on the
# shared `state` column, then display the combined table
final_df <- merge(x = state_count, y = state_totals, by = "state")
final_df
```
Correlation Test
```{r}
# Pull the two per-state series out of the merged table
covid_cases_Data <- final_df[["cases"]]
covid_vacc_Data <- final_df[["vaccine_sum"]]
# Correlation between case totals and vaccine totals (cor() defaults)
corr_coef <- cor(x = covid_cases_Data, y = covid_vacc_Data)
print(corr_coef)
```
A correlation coefficient of 0.977 indicates a strong positive correlation between the number of Covid cases in a state and the number of vaccines allocated to that state. This supports the conclusion regarding the government's overall strategy of supplying more vaccines to states with larger populations and a heavier load of Covid cases.