generated from jtr13/EDAVtemplate
-
Notifications
You must be signed in to change notification settings - Fork 1
/
03-cleaning.Rmd
156 lines (116 loc) · 5.18 KB
/
03-cleaning.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# Data transformation
## Import Data
1) SP500 & Sector Index
```{r}
library(tidyverse)
data_path <- "C:/Users/weili/Desktop/G5293/Final Project/Final_Project.xlsx"
readxl::excel_sheets(path = data_path)
SP500 <- readxl::read_excel(path=data_path,sheet=1)
Energy <- readxl::read_excel(path=data_path,sheet=2)
Info_tech <- readxl::read_excel(path=data_path,sheet=3)
Fin <-readxl::read_excel(path=data_path,sheet=4)
Health_care <- readxl::read_excel(path=data_path,sheet=5)
Consumer_D <- readxl::read_excel(path=data_path,sheet=6)
Utility <- readxl::read_excel(path=data_path,sheet=7)
Consumer_S <- readxl::read_excel(path=data_path,sheet=8)
Industrials <- readxl::read_excel(path=data_path,sheet=9)
Commu_Services <- readxl::read_excel(path=data_path,sheet=10)
Materials <- readxl::read_excel(path=data_path,sheet=11)
Real_Estate <- readxl::read_excel(path=data_path,sheet=12)
```
Pick a dataset to take a brief look at:
```{r}
head(SP500)
```
2) S&P500 Constituents Financials:
```{r}
SP500_comp <- read.csv("C:/Users/weili/Desktop/G5293/Final Project/constituents_financials.csv")
SP500_sector <- readxl::read_xlsx("C:/Users/weili/Desktop/G5293/Final Project/500-sector-representation.xlsx",skip=3)
head(SP500_comp)
head(SP500_sector)
```
## Data Transformation:
### Rename column names
First of all, some names of the columns in the data set are not easy for people to read or understand. Therefore, we change some of the names of the columns:
- For each 11 sectors, we name them by their sector name first and then form the data frame by using “P_name of sector”(ie.P_SP500,P_CommuSer….) instead of “PX_LAST” as shown in the original data set.
- Same for the volume columns, we change the name of “PX_VOLUME” into “V_name of sector” for 11 different sectors.
```{r}
colnames(SP500) <- c("Date","P_SP500","V_SP500","R_SP500")
colnames(Commu_Services) <- c("Date","P_CommuSer","V_CommuSer","R_CommuSer")
colnames(Consumer_D) <- c("Date","P_ConsumerD","V_ConsumerD","R_ConsumerD")
colnames(Consumer_S) <- c("Date","P_ConsumerS","V_ConsumerS","R_ConsumerS")
colnames(Energy) <- c("Date","P_Energy","V_Energy","R_Energy")
colnames(Fin) <- c("Date","P_Fin","V_Fin","R_Fin")
colnames(Health_care) <- c("Date","P_Health","V_Health","R_Health")
colnames(Industrials) <- c("Date","P_Industrial","V_Industrial","R_Industrial")
colnames(Info_tech) <- c("Date","P_Infotech","V_Infotech","R_Infotech")
colnames(Materials) <- c("Date","P_Materials","V_Materials","R_Materials")
colnames(Real_Estate) <- c("Date","P_RealEstate","V_RealEstate","R_RealEstate")
colnames(Utility) <- c("Date","P_Utility","V_Utility","R_Utility")
```
```{r}
SP500_sector <- SP500_sector%>%
dplyr::rename("Sector"="MARKET REPRESENTATION",
"election_1"="election...6",
"election_2"="election...9")%>%
head(-2)
colnames(SP500_sector) <- gsub(" ","_",colnames(SP500_sector))
SP500_sector <- SP500_sector[SP500_sector$Sector !="S&P 500",]
colnames(SP500_comp) <- gsub("[.]","_",colnames(SP500_comp))
print(colnames(SP500_sector))
print(colnames(SP500_comp))
```
### Merge into one dataset
```{r}
USStock <- SP500%>%
left_join(Commu_Services,by="Date")%>%
left_join(Consumer_D,by="Date")%>%
left_join(Consumer_S,by="Date")%>%
left_join(Energy,by="Date")%>%
left_join(Fin,by="Date")%>%
left_join(Health_care,by="Date")%>%
left_join(Industrials,by="Date")%>%
left_join(Info_tech,by="Date")%>%
left_join(Materials,by="Date")%>%
left_join(Real_Estate,by="Date")%>%
left_join(Utility,by="Date")
head(USStock)
```
### Seperate Price and Volume Data
```{r}
USStock_Price <- USStock%>%
select(Date,starts_with("P_"))
USStock_Volume <- USStock%>%
select(Date,starts_with("V_"))
USStock_Return <- USStock%>%
select(Date,starts_with("R_"))
head(USStock_Price)
head(USStock_Volume)
head(USStock_Return)
```
Moreover, we have one small add-ons of the data. Although it is not data transformation on the original ones that we directly get from the website. We indeed make some changes. We add one more column for calculating the returns by using the formula: (Pi/P1/6/17)-1 where i is the date price.
```{r}
APPLE <- read.csv("C:/Users/weili/Desktop/G5293/Final Project/AAPL.csv")
Google <- read.csv("C:/Users/weili/Desktop/G5293/Final Project/GOOG.csv")
MSFT <- read.csv("C:/Users/weili/Desktop/G5293/Final Project/MSFT.csv")
Facebook <- read.csv("C:/Users/weili/Desktop/G5293/Final Project/FB.csv")
head(APPLE)
APPLE <- APPLE%>%
select(Date,Close)%>%
dplyr::rename(APPLE_close=Close)%>%
mutate(Apple_return = c(-diff(APPLE$Close)/APPLE$Close[-1]*100,NA))
head(APPLE)
Google <- Google%>%
select(Date,Close)%>%
mutate(Google_return = c(-diff(Google$Close)/Google$Close[-1] * 100, NA))
colnames(Google) <- c("Date","Google_close","Google_return")
MSFT <- MSFT%>%
select(Date,Close)%>%
mutate(MSFT_return = c(-diff(MSFT$Close)/MSFT$Close[-1] * 100, NA))
colnames(MSFT) <- c("Date","MSFT_close","MSFT_return")
Facebook <- Facebook%>%
select(Date,Close)%>%
mutate(FB_return = c(-diff(Facebook$Close)/Facebook$Close[-1] * 100, NA))
colnames(Facebook) <- c("Date","FB_close","FB_return")
```
Other than what I mentioned above, that’s all the changes we made on the original data. There are no further transformations on the data.