# Brett Waugh
# 12 April 2019
# capstone.R
# Different analytic techniques to predict the
# number of Starbucks stores in an area.
# Data was retrieved from: https://www.kaggle.com/starbucks/store-locations
#########################################################################################################
########################################### Data & Libraries ############################################
#########################################################################################################
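# A one-time setup sketch (not in the original script): install any missing CRAN
# packages before loading them. stats, graphics, grid, and lattice ship with R
# and are excluded.
pkgs <- c("readr", "ggplot2", "plyr", "dplyr", "cluster", "gridExtra",
          "mgcv", "caret", "e1071", "boot", "ISLR", "arm")
missingPkgs <- setdiff(pkgs, rownames(installed.packages()))
if (length(missingPkgs) > 0) install.packages(missingPkgs)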
# Necessary libraries. library() is used so a missing package fails loudly,
# and plyr is loaded before dplyr to avoid masking dplyr's functions.
library(readr)
library(ggplot2)
library(plyr)
library(dplyr)
library(stats)
library(cluster)
library(lattice)
library(graphics)
library(grid)
library(gridExtra)
library(mgcv)
library(caret)
library(e1071)
library(boot)
library(ISLR)
library(arm)
# Load in the data. Data was provided by: https://www.kaggle.com/starbucks/store-locations
directory <- read_csv("~/Documents/IDS4934/capstone/directory.csv")
# States and counts calculated in Splunk.
starbucksByState <- read_csv("~/Documents/IDS4934/capstone/starbucksByState.csv")
# Make into a dataframe.
df <- as.data.frame(directory)
# Set seed for study.
set.seed(950)
#########################################################################################################
############################################# Data filtering ############################################
#########################################################################################################
# Only keep stores that are Starbucks. Teavana stores were originally included.
df1 <- subset(df, Brand=="Starbucks")
# Verify that only Starbucks stores remain (should return 0).
nrow(df1[df1$Brand != "Starbucks", ])
# Drop columns that are not needed for the analysis.
df2 <- subset(df1, select = -c(`Store Number`, `Store Name`, `Ownership Type`, `Street Address`, `Phone Number`, `Timezone`))
# Verify that those fields were dropped.
str(df2)
# Only keep stores that are in the United States. Dataset provided worldwide locations.
df3 <- subset(df2, Country=="US")
# Verify that only US locations were kept.
nrow(df3[df3$Country!="US",])
#########################################################################################################
############################################# First Method #############################################
#########################################################################################################
# Is the mean number of Starbucks per state a good estimate of the count in any individual state?
# Average number of Starbucks per state.
meanStarbucks <- nrow(df3)/nrow(starbucksByState)
# Histogram of "Number of Starbucks by State". G1.
ggplot(df3, aes(x = `State/Province`, col = `State/Province`, fill = `State/Province`)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = -90), legend.position = "none") +
  labs(x = "State", y = "Number of Starbucks", title = "Number of Starbucks by State", caption = "based on data from: https://www.kaggle.com/starbucks/store-locations")
# Histogram of "Number of Starbucks by State" with mean line. G2.
ggplot(df3, aes(x = `State/Province`, col = `State/Province`, fill = `State/Province`)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = -90), legend.position = "none") +
  labs(x = "State", y = "Number of Starbucks", title = "Number of Starbucks by State", caption = "based on data from: https://www.kaggle.com/starbucks/store-locations") +
  geom_hline(yintercept = meanStarbucks)
# Accuracy for first method.
# No states match this mean! Closest results are: MA has 262, MD has 252, MI has 272, NV has 249, and NJ has 249.
method1Result <- nrow(starbucksByState[starbucksByState$count==meanStarbucks,])/nrow(starbucksByState)
method1Result
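# The state counts are integers while the mean generally is not, so exact equality
# can never match. A looser sketch (not part of the original method): count the
# states within 10% of the mean; the 10% tolerance is an arbitrary illustration.
tol <- 0.10 * meanStarbucks
mean(abs(starbucksByState$count - meanStarbucks) <= tol)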
#########################################################################################################
############################################# Second Method ############################################
#########################################################################################################
# Does removing outliers and looking within one standard deviation improve our
# chances of predicting the number of Starbucks in an area?
# Boxplot to determine outliers. G4.
ggplot(starbucksByState, aes(y = count)) +
geom_boxplot(outlier.colour = "red", show.legend = NA, outlier.shape = 8, outlier.size = 5) +
theme(legend.position = "none") +
labs(y = "Number of Starbucks", title = "Distribution of Starbucks in United States", caption = "based on data from: https://www.kaggle.com/starbucks/store-locations")
# Standard deviation.
allSD <- sd(starbucksByState$count)
allSD
# Remove the outliers flagged by the boxplot (CA, TX, WA); the SD of the remaining data is 173.17.
sb1 <- subset(starbucksByState, State_Province!="CA" & State_Province!="TX" & State_Province!="WA")
sb1
# Mean of data without outliers.
meanSB1 <- mean(sb1$count)
meanSB1
# Standard deviation of data without outliers.
sdSB1 <- sd(sb1$count)
sdSB1
# Upper and lower limits.
upSB1 <- meanSB1 + sdSB1
upSB1
loSB1 <- meanSB1 - sdSB1
loSB1
# Histogram of "Number of Starbucks by State" within one standard deviation (93.05,439.39). G5.
ggplot(sb1, aes(x = State_Province, y = count, col = State_Province, fill = State_Province)) +
geom_col() +
theme(axis.text.x = element_text(angle = -90), legend.position = "none") +
labs(x = "State", y = "Number of Starbucks", title = "Number of Starbucks by State", caption = "based on data from: https://www.kaggle.com/starbucks/store-locations") +
geom_hline(yintercept = upSB1) +
geom_hline(yintercept = loSB1)
# Boxplot for "Distribution of Starbucks in United States without CA, TX, and WA". G6.
ggplot(sb1, aes(y = count)) +
geom_boxplot(outlier.colour = "red", show.legend = NA, outlier.shape = 8, outlier.size = 5) +
theme(legend.position = "none") +
labs(y = "Number of Starbucks", title = "Distribution of Starbucks in United States without CA, TX, WA", caption = "based on data from: https://www.kaggle.com/starbucks/store-locations")
# Accuracy for second method.
method2Result <- nrow(sb1[sb1$count<=upSB1 & sb1$count>=loSB1,])/nrow(starbucksByState)
method2Result
#########################################################################################################
############################################# Third Method #############################################
#########################################################################################################
# Linear Regression
# Can I use linear regression on lat/long to predict Starbucks locations?
# Map showing "Linear Regression on Starbucks locations". G9.
ggplot(df3, aes(x=Longitude, y=Latitude)) +
geom_smooth(method="lm") +
geom_point() +
labs(x="Longitude", y = "Latitude", title = "Linear Regression on Starbucks locations", caption = "based on data from: https://www.kaggle.com/starbucks/store-locations")
# Correlation between latitude and longitude.
cor(df3$Longitude, df3$Latitude)
# Create the linear model, regressing Latitude on Longitude so the fit matches the smooth drawn in G9.
linearMod <- lm(Latitude ~ Longitude, data = df3)
summary(linearMod)
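# A sketch, not in the original script: compute this fit's in-sample RMSE the same
# way it is computed for the later models, so the methods can be compared directly.
linearModRMSE <- sqrt(mean(linearMod$residuals^2))
linearModRMSE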
#########################################################################################################
############################################# More data ################################################
#########################################################################################################
# By using more data than what is provided, we may be able to apply more advanced
# techniques and get better results.
# The additional files were combined into a single CSV with Splunk; the underlying
# sources come from several different locations.
addData <- read_csv("~/Documents/IDS4934/capstone/addData.csv")
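# A quick sanity check, not in the original script. The column names used below
# (starbucks, income, numUni, population, crime) are assumed from the model formulas.
str(addData)
summary(addData)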
#########################################################################################################
############################################# Fourth Method ############################################
#########################################################################################################
# Multivariate Linear Regression
# Can I use multivariate linear regression on the additional data to predict Starbucks locations?
# First fit with every feature, to see which are relevant.
linFit <- lm(starbucks ~ income + numUni + population + crime, data = addData)
summary(linFit)
# Determine the RMSE of the full linear model.
linFitRMSE <- sqrt(mean(linFit$residuals^2))
linFitRMSE
# Second fit, dropping crime based on the first run.
linFit2 <- lm(starbucks ~ income + numUni + population, data = addData)
summary(linFit2)
# Determine the RMSE of the reduced linear model.
linFit2RMSE <- sqrt(mean(linFit2$residuals^2))
linFit2RMSE
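# A sketch, not in the original analysis: linFit2 is nested in linFit, so an F-test
# can check whether dropping crime loses explanatory power (a large p-value
# suggests it does not).
anova(linFit2, linFit)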
#########################################################################################################
############################################# Fifth Method #############################################
#########################################################################################################
# Generalized Linear Models (GLM)
# Can I use a GLM on the additional data to predict Starbucks locations?
# Note: glm() defaults to the gaussian family, so these are linear fits rather than
# true logistic regressions (a logistic fit would need a binary response).
# First GLM, including all fields in the additional data.
logmod1 <- glm(starbucks ~ income + numUni + population + crime, data = addData)
summary(logmod1) # display results
# Calculate RMSE for the first GLM.
logmod1RMSE <- sqrt(mean(logmod1$residuals^2))
logmod1RMSE
# Second GLM, excluding crime rate from the additional data.
logmod2 <- glm(starbucks ~ income + numUni + population, data = addData)
summary(logmod2)
# Calculate RMSE for the second GLM.
logmod2RMSE <- sqrt(mean(logmod2$residuals^2))
logmod2RMSE
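# A sketch, not part of the original analysis: since starbucks is a count, a Poisson
# family is a common alternative to the gaussian default used above.
poisMod <- glm(starbucks ~ income + numUni + population, family = poisson, data = addData)
summary(poisMod)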
# Third model: a Bayesian GLM, again excluding crime rate, evaluated with repeated
# ten-fold cross-validation.
ControlParameters <- trainControl(method = "repeatedcv",
                                  number = 10,
                                  repeats = 10)
logmod3 <- train(starbucks ~ income + numUni + population,
                 data = addData,
                 method = 'bayesglm',
                 trControl = ControlParameters)
logmod3
# Pull the cross-validated RMSE from the caret results rather than hardcoding it
# (the original run reported 201.2931).
logmod3RMSE <- min(logmod3$results$RMSE)
###########################################################################################################
############################################# Sixth Method ###############################################
###########################################################################################################
# Support Vector Machines (SVM) Models
# Can I use an SVM model on the new data to predict Starbucks locations?
# Keep the columns used by the SVM models in a plain data frame
# (assumed from the formulas below: starbucks, income, numUni, population).
df4 <- as.data.frame(addData[2:5])
# Create training and testing sets (sequential 80/20 split; a random alternative
# is sketched just below).
svm_size <- round(.8 * nrow(df4))
svm_train <- df4[1:svm_size, ]
svm_test <- df4[-(1:svm_size), ]
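# A sketch of a random 80/20 split (not in the original script, and not used below);
# preferable when the rows have a meaningful order. Reproducible via the seed set earlier.
rand_idx <- sample(nrow(df4), svm_size)
svm_train_rand <- df4[rand_idx, ]
svm_test_rand <- df4[-rand_idx, ]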
# First SVM model with cost = 10, epsilon = 1, and ten-fold cross-validation.
# Bare column names in the formula let predict() respect its newdata argument,
# and the model is fit on the training rows only.
svmMod <- svm(starbucks ~ income + numUni + population, data = svm_train,
              cost = 10, epsilon = 1, cross = 10, scale = FALSE, kernel = 'radial')
print(svmMod)
# In-sample RMSE on the training set.
svmModPred <- predict(svmMod, svm_train)
errval <- svm_train$starbucks - svmModPred
svm_RMSE <- sqrt(mean(errval^2))
print(paste('SVM RMSE: ', svm_RMSE))
# Grid search for the best epsilon and cost values. Made with help from: https://rpubs.com/richkt/280840
tuneResult1 <- tune(svm, starbucks ~ income + numUni + population, data = svm_train,
                    ranges = list(epsilon = seq(0, 1, 0.01), cost = seq(0.01, 5, 0.05)))
# Plot the tuning results.
plot(tuneResult1)
# Second, finer grid search around the best values found above.
tuneResult <- tune(svm, starbucks ~ income + numUni + population, data = svm_train,
                   ranges = list(epsilon = seq(tuneResult1$best.model$epsilon * 1.01,
                                               tuneResult1$best.model$epsilon * 1.1,
                                               length.out = 10),
                                 cost = seq(tuneResult1$best.model$cost - 1,
                                            tuneResult1$best.model$cost + 1,
                                            length.out = 10)))
plot(tuneResult)
print(tuneResult)
# Final SVM model with the best values from tuneResult.
svmModTuned <- svm(starbucks ~ income + numUni + population, data = svm_train,
                   cost = tuneResult$best.parameters$cost,
                   epsilon = tuneResult$best.parameters$epsilon,
                   cross = 10, scale = FALSE, kernel = 'radial')
print(svmModTuned)
# In-sample RMSE on the training set.
svmModPredTuned <- predict(svmModTuned, svm_train)
errvalTuned <- svm_train$starbucks - svmModPredTuned
svm_RMSETuned <- sqrt(mean(errvalTuned^2))
print(paste('Tuned SVM RMSE: ', svm_RMSETuned))
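# A sketch, not in the original script: svm_test is otherwise unused, so evaluate
# the tuned model on the held-out rows for an out-of-sample error estimate.
svmTestPred <- predict(svmModTuned, svm_test)
svm_testRMSE <- sqrt(mean((svm_test$starbucks - svmTestPred)^2))
print(paste('Tuned SVM test RMSE: ', svm_testRMSE))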
###########################################################################################################
########################################### Results from Models ###########################################
###########################################################################################################
# Collect each model's RMSE into a data frame.
resultsDF <- data.frame(rmse = c(linFitRMSE,
                                 linFit2RMSE,
                                 logmod1RMSE,
                                 logmod2RMSE,
                                 logmod3RMSE,
                                 svm_RMSE,
                                 svm_RMSETuned),
                        names = c("Linear Model (all features)",
                                  "Linear Model (without crime)",
                                  "GLM (all features)",
                                  "GLM (without crime)",
                                  "Bayesian GLM (without crime)",
                                  "SVM (without crime)",
                                  "SVM (without crime, tuned)"))
# Graph of models and RMSE.
ggplot(resultsDF, aes(x = names, y = rmse, col = names)) +
  geom_point(size = 6) +
  labs(x = "Model", y = "RMSE", title = "Model Performance") +
  theme(axis.text.x = element_text(angle = -45), legend.position = "none")